• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
3  *
4  * For Intel/AMD x86 or x86-64 CPU (Pentium-MMX or later) and GNU C compiler.
5  *
6  * Last changed in libpng 1.2.19 August 18, 2007
7  * For conditions of distribution and use, see copyright notice in png.h
8  * Copyright (c) 1998 Intel Corporation
9  * Copyright (c) 1999-2002,2007 Greg Roelofs
10  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
11  *
12  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
13  * Interface to libpng contributed by Gilles Vollant, 1999.
14  * GNU C port by Greg Roelofs, 1999-2001.
15  *
16  * References:
17  *
18  *     http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
19  *     http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
20  *       [Intel's performance analysis of the MMX vs. non-MMX code;
21  *        moved/deleted as of 2006, but text and some graphs still
22  *        available via WayBack Machine at archive.org]
23  *
24  *     http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html
25  *     http://sam.zoy.org/blog/2007-04-13-shlib-with-non-pic-code-have-inline-assembly-and-pic-mix-well
26  *     http://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
27  *     http://gcc.gnu.org/onlinedocs/gcc/Variable-Attributes.html
28  *     http://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
29  *     AMD64 Architecture Programmer's Manual, volumes 1 and 5
30  *       [http://www.amd.com/us-en/Processors/TechnicalResources/0,,30_182_739_7044,00.html]
31  *     Intel 64 and IA-32 Software Developer's Manuals
32  *       [http://developer.intel.com/products/processor/manuals/]
33  *
34  * png_read_filter_row_mmx_*() were converted in place with intel2gas 1.3.1:
35  *
36  *     intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
37  *
38  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
39  *
40  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
41  * is required to assemble the newer asm instructions such as movq.  (Version
42  * 2.5.2l.15 is definitely too old.)  See ftp://ftp.gnu.org/pub/gnu/binutils/ .
43  */
44 
45 /*
46  * PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47  * ===========================
48  *
49  * 19991006:
50  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51  *
52  * 19991007:
53  *  - additional optimizations (possible or definite):
54  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55  *     - write MMX code for 48-bit case (pixel_bytes == 6)
56  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
57  *        why subtract 8 from width_mmx in the pass 4/5 case?
58  *        (only width_mmx case) (near line 2335)
59  *     x [DONE] replace pixel_bytes within each block with the true
60  *        constant value (or are compilers smart enough to do that?)
61  *     - rewrite all MMX interlacing code so it's aligned with
62  *        the *beginning* of the row buffer, not the end.  This
63  *        would not only allow one to eliminate half of the memory
64  *        writes for odd passes (that is, pass == odd), it may also
65  *        eliminate some unaligned-data-access exceptions (assuming
66  *        there's a penalty for not aligning 64-bit accesses on
67  *        64-bit boundaries).  The only catch is that the "leftover"
68  *        pixel(s) at the end of the row would have to be saved,
69  *        but there are enough unused MMX registers in every case,
70  *        so this is not a problem.  A further benefit is that the
71  *        post-MMX cleanup code (C code) in at least some of the
72  *        cases could be done within the assembler block.
73  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74  *     inconsistent, and don't match the MMX Programmer's Reference
75  *     Manual conventions anyway.  They should be changed to
76  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77  *     was lowest in memory (i.e., corresponding to a left pixel)
78  *     and b7 is the byte that was highest (i.e., a right pixel).
79  *
80  * 19991016:
81  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
82  *     want globals prefixed by underscores when referencing them--
83  *     i.e., if the variable is const4, then refer to it as const4,
84  *     not _const4.  This seems to be a djgpp-specific requirement.
85  *     Also, such variables apparently *must* be declared outside
86  *     of functions; neither static nor automatic variables work if
87  *     defined within the scope of a single function, but both
88  *     static and truly global (multi-module) variables work fine.
89  *
90  * 19991017:
91  *  - replaced pixel_bytes in each png_memcpy() call with constant value for
92  *     inlining (png_do_read_interlace() "non-MMX/modified C code" block)
93  *
94  * 19991023:
95  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
96  *  - switched from string-concatenation-with-macros to cleaner method of
97  *     renaming global variables for djgpp--i.e., always use prefixes in
98  *     inlined assembler code (== strings) and conditionally rename the
99  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
100  *
101  * 19991024:
102  *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
103  *     This one was severely weird:  even though mmxsupport() doesn't touch
104  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
105  *     the register (even in static/non-fPIC code--see below), which in turn
106  *     caused png_do_read_interlace() to return prematurely on the first row of
107  *     interlaced images (i.e., without expanding the interlaced pixels).
108  *     Inspection of the generated assembly code didn't turn up any clues,
109  *     although it did point at a minor optimization (i.e., get rid of
110  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
111  *     instruction is more destructive than it looks?  (Not yet checked.)
112  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
113  *     listings...  Apparently register spillage has to do with ebx, since
114  *     it's used to index the global offset table.  Commenting it out of the
115  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
116  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
117  *
118  * 19991107:
119  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
120  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
121  *
122  * 19991120:
123  *  - made "diff" variable (now "_dif") global to simplify conversion of
124  *     filtering routines (running out of regs, sigh).  "diff" is still used
125  *     in interlacing routines, however.
126  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
127  *     macro determines which is used); original not yet tested.
128  *
129  * 20000213:
130  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
131  *
132  * 20000319:
133  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
134  *     pass == 4 or 5, that caused visible corruption of interlaced images
135  *
136  * 20000623:
137  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
138  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
139  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
140  *     Chuck Wilson supplied a patch involving dummy output registers.  See
141  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
142  *     for the original (anonymous) SourceForge bug report.
143  *
144  * 20000706:
145  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
146  *       pnggccrd.c: In function `png_combine_row':
147  *       pnggccrd.c:525: more than 10 operands in `asm'
148  *       pnggccrd.c:669: more than 10 operands in `asm'
149  *       pnggccrd.c:828: more than 10 operands in `asm'
150  *       pnggccrd.c:994: more than 10 operands in `asm'
151  *       pnggccrd.c:1177: more than 10 operands in `asm'
152  *     They are all the same problem and can be worked around by using the
153  *     global _unmask variable unconditionally, not just in the -fPIC case.
154  *     Reportedly earlier versions of gcc also have the problem with more than
155  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
156  *
157  * 20000729:
158  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
159  *     MMX routine); began converting png_read_filter_row_mmx_sub()
160  *  - to finish remaining sections:
161  *     - clean up indentation and comments
162  *     - preload local variables
163  *     - add output and input regs (order of former determines numerical
164  *        mapping of latter)
165  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
166  *     - remove "$" from addressing of Shift and Mask variables [20000823]
167  *
168  * 20000731:
169  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
170  *
171  * 20000822:
172  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
173  *     shared-library (-fPIC) version!  Code works just fine as part of static
174  *     library.  Should have tested that sooner.
175  *     ebx is getting clobbered again (explicitly this time); need to save it
176  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
177  *
178  * 20000823:
179  *  - first section was trickiest; all remaining sections have ebx -> edx now.
180  *     (-fPIC works again.)  Also added missing underscores to various Shift*
181  *     and *Mask* globals and got rid of leading "$" signs.
182  *
183  * 20000826:
184  *  - added visual separators to help navigate microscopic printed copies
185  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
186  *     on png_read_filter_row_mmx_avg()
187  *
188  * 20000828:
189  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
190  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
191  *     cleaned up/shortened in either routine, but functionality is complete
192  *     and seems to be working fine.
193  *
194  * 20000829:
195  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
196  *     as an input reg (with dummy output variables, etc.), then it *cannot*
197  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
198  *     is simple enough...
199  *
200  * 20000914:
201  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
202  *     correctly (but 48-bit RGB just fine)
203  *
204  * 20000916:
205  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
206  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
207  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
208  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
209  *
210  * 20010101:
211  *  - added new png_init_mmx_flags() function (here only because it needs to
212  *     call mmxsupport(), which should probably become global png_mmxsupport());
213  *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
214  *
215  * 20010103:
216  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
217  *     and made it public; moved png_init_mmx_flags() to png.c as internal func
218  *
219  * 20010104:
220  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
221  *     within MMX version of png_read_filter_row()) so no longer necessary to
222  *     compile it into pngrutil.o
223  *
224  * 20010310:
225  *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
226  *
227  * 20010808:
228  *  - added PNG_THREAD_UNSAFE_OK around code using global variables [GR-P]
229  *
230  * 20011124:
231  *  - fixed missing save of Eflag in png_mmx_support() [Maxim Sobolev]
232  *
233  * 20020304:
234  *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
235  *
236  * 20020407:
237  *  - fixed insufficient preservation of ebx register [Sami Farin]
238  *
239  * 20040724:
240  *  - more tinkering with clobber list at lines 4529 and 5033 to get it to
241  *     compile with gcc 3.4 [GR-P]
242  *
243  * 20040809:
244  *  - added "rim" definitions for CONST4 and CONST6 [GR-P]
245  *
246  * 20060303:
247  *  - added "OS2" to list of systems that don't need leading underscores [GR-P]
248  *
249  * 20060320:
250  *  - made PIC-compliant [Christian Aichinger]
251  *
252  * 20070313:
253  *  - finally applied Giuseppe Ghibò's 64-bit patch of 20060803 (completely
254  *     overlooked Dylan Alex Simon's similar patch of 20060414, oops...)
255  *
256  * 20070524:
257  *  - fixed link failure caused by asm-only variables being optimized out
258  *     (identified by Dimitri of Trolltech) with __attribute__((used)), which
259  *     also gets rid of warnings => nuked ugly png_squelch_warnings() hack
260  *  - dropped redundant ifdef
261  *  - moved png_mmx_support() back up where originally intended (as in
262  *     pngvcrd.c), using __attribute__((noinline)) in extra prototype
263  *
264  * 20070527:
265  *  - revised png_combine_row() to reuse mask in lieu of external _unmask
266  *  - moved 32-bit (RGBA) case to top of png_combine_row():  most common
267  *  - just about ready to give up on x86-64 -fPIC mode; can't even access 16
268  *     _mask*_* constants without triggering link error on shared library:
269  *       /usr/bin/ld: pnggccrd.pic.o: relocation R_X86_64_32S against `a local
270  *         symbol' can not be used when making a shared object; recompile with
271  *         -fPIC
272  *       pnggccrd.pic.o: could not read symbols: Bad value
273  *       ("objdump -x pnggccrd.pic.o | grep rodata" to verify)
274  *     [might be able to work around by doing within assembly code whatever
275  *     -fPIC does, but given problems to date, seems like long shot...]
276  *     [relevant ifdefs:  __x86_64__ && __PIC__ => C code only]
277  *  - changed #if 0 to #ifdef PNG_CLOBBER_MMX_REGS_SUPPORTED in case gcc ever
278  *     supports MMX regs (%mm0, etc.) in clobber list (not supported by gcc
279  *     2.7.2.3, 2.91.66 (egcs 1.1.2), 3.x, or 4.1.2)
280  *
281  * 20070603:
282  *  - revised png_combine_row() to use @GOTPCREL(%%rip) addressing on _c64
283  *     struct of _mask*_* constants for x86-64 -fPIC; see sam.zoy.org link
284  *     above for details
285  *  - moved _const4 and _const6 into _c64 struct, renamed to _amask5_3_0 and
286  *     _amask7_1_0, respectively
287  *  - can't figure out how to use _c64._mask*_* vars within asm code, so still
288  *     need single variables for non-x86-64/-fPIC half :-(
289  *  - replaced various __PIC__ ifdefs with *_GOT_ebx macros
290  *  - moved _LBCarryMask and _HBClearMask into _c64 struct
291  *  - conditionally replaced _p*temp variables with %r11d-%r13d (via p*_TEMP
292  *     and CLOBBER_r1*d macros)
293  *
294  * 20070604:
295  *  - replaced all _ActiveMask and _ActiveMaskEnd with new _amask*_*_* consts
296  *     (_amask naming convention:  numbers of 00-bytes, ff-bytes, 00-bytes)
297  *    - _ActiveMask     // (10) // avg/paeth/sub; read-only; consts; movq/pand
298  *       0x0000000000ffffffLL (bpp 3, avg)      _amask5_3_0
299  *       0xffffffffffffffffLL (bpp 4, 6, avg)   _amask0_8_0
300  *       0x000000000000ffffLL (bpp 2, avg)      _amask6_2_0
301  *       0x0000000000ffffffLL (bpp 3, paeth)    _amask5_3_0
302  *       0x00000000ffffffffLL (bpp 6, paeth)    _amask4_4_0
303  *       0x00000000ffffffffLL (bpp 4, paeth)    _amask4_4_0
304  *       0x00000000ffffffffLL (bpp 8, paeth)    _amask4_4_0
305  *       0x0000ffffff000000LL (bpp 3, sub)      _amask2_3_3
306  *       0x00000000ffff0000LL (bpp 2, sub)      _amask4_2_2
307  *    - _ActiveMaskEnd  // (1)  // paeth only; read-only; const; pand
308  *       0xffff000000000000LL (bpp 3, paeth)    _amask0_2_6
309  *  - changed all "#if defined(__x86_64__) // later // && defined(__PIC__)"
310  *     lines to "#ifdef PNG_x86_64_USE_GOTPCREL" for easier/safer testing
311  *
312  * 20070605:
313  *  - merged PNG_x86_64_USE_GOTPCREL, non-PNG_x86_64_USE_GOTPCREL code via
314  *     *MASK* and LOAD/RESTORE macros
315  *
316  * 20070607:
317  *  - replaced all constant instances of _ShiftBpp, _ShiftRem with immediates
318  *     (still have two shared cases in avg, sub routines)
319  *
320  * 20070609:
321  *  - replaced remaining instances of _ShiftBpp, _ShiftRem with immediates
322  *     (split sub and avg 4/6-bpp cases into separate blocks)
323  *  - fixed paeth bug due to clobbered r11/r12/r13 regs
324  *
325  * 20070610:
326  *  - made global "_dif" variable (avg/paeth/sub routines) local again (now
327  *     "diff"--see 19991120 entry above), using register constraints
328  *  - note that %ebp in clobber list doesn't actually work, at least for 32-bit
329  *     version and gcc 4.1.2; must save and restore manually.  (Seems to work
330  *     OK for 64-bit version and gcc 3.4.3, but gcc may not be using ebp/rbp
331  *     in that case.)
332  *  - started replacing direct _MMXLength accesses with register constraints
333  *
334  * 20070612:
335  *  - continued replacing direct _MMXLength accesses with register constraints
336  *
337  * 20070613:
338  *  - finished replacing direct _MMXLength accesses with register constraints;
339  *     switched to local variable (and renamed back to MMXLength)
340  *
341  * 20070614:
342  *  - fixed sub bpp = 1 bug
343  *  - started replacing direct _FullLength accesses with register constraints
344  *
345  * 20070615:
346  *  - fixed 64-bit paeth bpp 3 crash bug (misplaced LOAD_GOT_rbp)
347  *  - fixed 64-bit paeth bpp 1/2 and cleanup-block crash bugs (misplaced
348  *     RESTORE_r11_r12_r13)
349  *  - slightly optimized avg/paeth cleanup blocks and paeth bpp 1/2 block
350  *     (save/restore ebx only if needed)
351  *  - continued replacing direct _FullLength accesses with register constraints
352  *
353  * 20070616:
354  *  - finished replacing direct _FullLength accesses with register constraints
355  *     (*ugly* conditional clobber-separator macros for avg and paeth, sigh)
356  *
357  * 20070618:
358  *  - fixed misplaced PNG_THREAD_UNSAFE_OK endif (was missing LOAD_GOT_rbp/
359  *     RESTORE_rbp in 32-bit thread-safe case)
360  *  - changed all "ifdef *" to "if defined(*)" [GR-P]
361  *
362  * 20070619:
363  *  - rearranged most bitdepth-related case statements to put most frequent
364  *     cases at top (24-bit, 32-bit, 8-bit, rest)
365  *
366  * 20070623:
367  *  - cleaned up png_debug() warnings/formatting
368  *  - removed PNG_MMX_CODE_SUPPORTED ifdefs and added outer __GNUC__ ifdef
369  *     (module no longer used by non-x86/non-GCC builds as of libpng 1.2.19)
370  *  - removed single libpng-1.2.x PNG_DEBUG dependency on 1.0.x png_struct
371  *     member (row_buf_size)
372  *  - rearranged pass-related if-blocks in png_do_read_interlace() to put most
373  *     frequent cases (4, 5) at top [GR-P suggestion]
374  *
375  * 20070624-29:
376  *  - fixed 64-bit crash bug:  pointers -> rsi/rdi, not esi/edi (switched to
377  *     %0/%1/%2/%3/%4 notation; eliminated size suffixes from relevant add/
378  *     inc/sub/mov instructions; changed dummy vars to pointers)
379  *     - png_combine_row()
380  *     - png_do_read_interlace()
381  *     - png_read_filter_row_mmx_avg()
382  *     - png_read_filter_row_mmx_paeth()
383  *     - png_read_filter_row_mmx_sub()
384  *     - png_read_filter_row_mmx_up()
385  *  - NOTE:  this fix makes use of the fact that modifying a 32-bit reg (e.g.,
386  *     %%ebx) clears the top half of its corresponding 64-bit reg (%%rbx), so
387  *     it's safe to mix 32-bit operations with 64-bit base/index addressing
388  *     (see new PSI/PAX/PBX/PDX/PBP/etc. "pointer-register" macros); applies
389  *     also to clobber lists
390  *
391  * 20070630:
392  *  - cleaned up formatting, macros, minor png_read_filter_row_mmx_sub() 8-bpp
393  *     register-usage inefficiency
394  *  - fixed 32-bit png_do_read_interlace() bug (was using pointer size for
395  *     64-bit dummy values)
396  *
397  * 20070703:
398  *  - added check for (manual) PIC macro to fix OpenBSD crash bug
399  *
400  * 20070717:
401  *  - fixed 48-bit png_combine_row() bug (was acting like 32-bit):  copy 6
402  *     bytes per pixel, not 4, and use stride of 6, not 4, in the second loop
403  *     of interlace processing of 48-bit pixels [GR-P]
404  *
405  * 20070722:
406  *  - fixed 64-bit png_uint_32 bug with MMXLength/FullLength temp vars
407  *
408  * [still broken:  tops of all row-filter blocks (input/output constraints);
409  *  shows up on 64-bit dynamic (-fPIC) version with -O2, especially if debug-
410  *  printfs enabled, but at right edge of odd-width images even if disabled]
411  *
412  *
413  * STILL TO DO:
414  *  - fix final thread-unsafe code using stack vars and pointer? (paeth top,
415  *     default, bottom only:  default, bottom already 5 reg constraints; could
416  *     replace bpp with pointer and group bpp/patemp/pbtemp/pctemp in array)
417  *  - fix ebp/no-reg-constraint inefficiency (avg/paeth/sub top)
418  *  - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
419  *  - write MMX code for 48-bit case (pixel_bytes == 6)
420  *  - figure out what's up with 24-bit case (pixel_bytes == 3):
421  *     why subtract 8 from width_mmx in the pass 4/5 case?  due to
422  *     odd number of bytes? (only width_mmx case) (near line 2335)
423  *  - rewrite all MMX interlacing code so it's aligned with beginning
424  *     of the row buffer, not the end (see 19991007 for details)
425  *  - add error messages to any remaining bogus default cases
426  *  - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
427  *  - try =r, etc., as reg constraints?  (would gcc use 64-bit ones on x86-64?)
428  *  - need full, non-graphical, CRC-based test suite...  maybe autogenerate
429  *     random data of various height/width/depth, compute CRCs, write (C
430  *     funcs), read (asm/MMX), recompute CRCs, and compare?
431  *  - write true x86-64 version using 128-bit "media instructions", %xmm0-15,
432  *     and extra general-purpose registers
433  */
434 
435 #if defined(__GNUC__)
436 
437 #define PNG_INTERNAL
438 #include "png.h"
439 
440 
441 /* Some toolchains do *not* predefine __PIC__ when -fPIC is used (observed
442  * with gcc 3.3.5 on OpenBSD; possibly elsewhere), so makefiles and the like
443  * must define the PIC macro explicitly; it is mapped onto __PIC__ here. */
444 #if defined(PIC) && !defined(__PIC__)   // (this can/should move to pngconf.h)
445 #  define __PIC__
446 #endif
447 
448 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
449 
450 /* x86-64 PIC builds use the GOTPCREL paths.  For full thread-safety on x86-64
451  * when linking statically too, remove the "&& defined(__PIC__)" part here: */
452 #if defined(__x86_64__) && defined(__PIC__)
453 #  define PNG_x86_64_USE_GOTPCREL            // GOTPCREL => full thread-safety
454 #  define PNG_CLOBBER_x86_64_REGS_SUPPORTED  // works as of gcc 3.4.3 ...
455 #endif
456 
457 int PNGAPI png_mmx_support(void);  // forward decl (see 20010103, 20070524 changelog entries)
458 
459 #if defined(PNG_USE_LOCAL_ARRAYS)
// Seven-entry lookup tables.  NOTE(review): by their names these are
// presumably indexed by interlace pass (0-6) and hold each pass's starting
// offset, step, and block width -- confirm against the core libpng sources.
460 static PNG_CONST int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
461 static PNG_CONST int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
462 static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
463 #endif
464 
465 /* Asm strings always use the underscore-prefixed names; djgpp, Win32, Cygwin,
466  * and OS2 add their own underscores to globals, so define them without: */
467 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
468     defined(__OS2__)
469 #  define _mmx_supported  mmx_supported
470 #  define _mask8_0        mask8_0
471 #  define _mask16_1       mask16_1
472 #  define _mask16_0       mask16_0
473 #  define _mask24_2       mask24_2
474 #  define _mask24_1       mask24_1
475 #  define _mask24_0       mask24_0
476 #  define _mask32_3       mask32_3
477 #  define _mask32_2       mask32_2
478 #  define _mask32_1       mask32_1
479 #  define _mask32_0       mask32_0
480 #  define _mask48_5       mask48_5
481 #  define _mask48_4       mask48_4
482 #  define _mask48_3       mask48_3
483 #  define _mask48_2       mask48_2
484 #  define _mask48_1       mask48_1
485 #  define _mask48_0       mask48_0
486 #  define _amask5_3_0     amask5_3_0
487 #  define _amask7_1_0     amask7_1_0
488 #  define _LBCarryMask    LBCarryMask
489 #  define _HBClearMask    HBClearMask
490 #  define _amask0_8_0     amask0_8_0
491 #  define _amask6_2_0     amask6_2_0
492 #  define _amask4_4_0     amask4_4_0
493 #  define _amask0_2_6     amask0_2_6
494 #  define _amask2_3_3     amask2_3_3
495 #  define _amask4_2_2     amask4_2_2
496 #  if defined(PNG_THREAD_UNSAFE_OK)
497 #    define _patemp       patemp
498 #    define _pbtemp       pbtemp
499 #    define _pctemp       pctemp
500 #  endif
501 #endif // djgpp, Win32, Cygwin, OS2
502 
503 
504 /* These constants are used in the inlined MMX assembly code. */
505 
506 typedef unsigned long long  ull;  // one 8-byte constant (cf. aligned(8) and the offsets below)
507 
508 #if defined(PNG_x86_64_USE_GOTPCREL)
509 static PNG_CONST struct {
510     // 26 ull members.  Declaration order fixes each member's byte offset,
511     // and those offsets are hard-coded in the MASK*/AMASK* strings below.
512     // png_combine_row() constants:
513     ull _mask8_0;
514     ull _mask16_0, _mask16_1;
515     ull _mask24_0, _mask24_1, _mask24_2;
516     ull _mask32_0, _mask32_1, _mask32_2, _mask32_3;
517     ull _mask48_0, _mask48_1, _mask48_2, _mask48_3, _mask48_4, _mask48_5;
518 
519     // png_do_read_interlace() constants:
520     ull _amask5_3_0, _amask7_1_0;  // was _const4 and _const6, respectively
521 
522     // png_read_filter_row_mmx_avg() constants (also uses _amask5_3_0):
523     ull _LBCarryMask, _HBClearMask;
524     ull _amask0_8_0, _amask6_2_0;  // was ActiveMask for bpp 4/6 and 2 cases
525 
526     // png_read_filter_row_mmx_paeth() constants (also uses _amask5_3_0):
527     ull _amask4_4_0, _amask0_2_6;  // was ActiveMask{,End} for bpp 6/4/8 and 3
528 
529     // png_read_filter_row_mmx_sub() constants:
530     ull _amask2_3_3, _amask4_2_2;  // was ActiveMask for bpp 3 and 2 cases
531 
532 } _c64 __attribute__((used, aligned(8))) = {
533 
534     // png_combine_row() constants:
535     0x0102040810204080LL, // _mask8_0      offset 0
536 
537     0x1010202040408080LL, // _mask16_0     offset 8
538     0x0101020204040808LL, // _mask16_1     offset 16
539 
540     0x2020404040808080LL, // _mask24_0     offset 24
541     0x0408080810101020LL, // _mask24_1     offset 32
542     0x0101010202020404LL, // _mask24_2     offset 40
543 
544     0x4040404080808080LL, // _mask32_0     offset 48
545     0x1010101020202020LL, // _mask32_1     offset 56
546     0x0404040408080808LL, // _mask32_2     offset 64
547     0x0101010102020202LL, // _mask32_3     offset 72
548 
549     0x4040808080808080LL, // _mask48_0     offset 80
550     0x2020202040404040LL, // _mask48_1     offset 88
551     0x1010101010102020LL, // _mask48_2     offset 96
552     0x0404080808080808LL, // _mask48_3     offset 104
553     0x0202020204040404LL, // _mask48_4     offset 112
554     0x0101010101010202LL, // _mask48_5     offset 120
555 
556     // png_do_read_interlace() constants:
557     0x0000000000FFFFFFLL, // _amask5_3_0   offset 128  (bpp 3, avg/paeth) const4
558     0x00000000000000FFLL, // _amask7_1_0   offset 136                     const6
559 
560     // png_read_filter_row_mmx_avg() constants:
561     0x0101010101010101LL, // _LBCarryMask  offset 144
562     0x7F7F7F7F7F7F7F7FLL, // _HBClearMask  offset 152
563     0xFFFFFFFFFFFFFFFFLL, // _amask0_8_0   offset 160  (bpp 4/6, avg)
564     0x000000000000FFFFLL, // _amask6_2_0   offset 168  (bpp 2,   avg)
565 
566     // png_read_filter_row_mmx_paeth() constants:
567     0x00000000FFFFFFFFLL, // _amask4_4_0   offset 176  (bpp 6/4/8, paeth)
568     0xFFFF000000000000LL, // _amask0_2_6   offset 184  (bpp 3, paeth)   A.M.End
569 
570     // png_read_filter_row_mmx_sub() constants:
571     0x0000FFFFFF000000LL, // _amask2_3_3   offset 192  (bpp 3, sub)
572     0x00000000FFFF0000LL, // _amask4_2_2   offset 200  (bpp 2, sub)
573 
574 };
575 
// Asm operand strings for the _c64 members, one per member, in the form
// "<byte offset>(%rbp)"; each displacement must equal the offset noted beside
// the member's initializer.  NOTE(review): assumes %rbp holds the address of
// _c64 at the point of use, presumably loaded by LOAD_GOT_rbp (not visible in
// this chunk) -- confirm.
576 #define MASK8_0        "(%%rbp)"
577 #define MASK16_0       "8(%%rbp)"
578 #define MASK16_1       "16(%%rbp)"
579 #define MASK24_0       "24(%%rbp)"
580 #define MASK24_1       "32(%%rbp)"
581 #define MASK24_2       "40(%%rbp)"
582 #define MASK32_0       "48(%%rbp)"
583 #define MASK32_1       "56(%%rbp)"
584 #define MASK32_2       "64(%%rbp)"
585 #define MASK32_3       "72(%%rbp)"
586 #define MASK48_0       "80(%%rbp)"
587 #define MASK48_1       "88(%%rbp)"
588 #define MASK48_2       "96(%%rbp)"
589 #define MASK48_3       "104(%%rbp)"
590 #define MASK48_4       "112(%%rbp)"
591 #define MASK48_5       "120(%%rbp)"
592 #define AMASK5_3_0     "128(%%rbp)"
593 #define AMASK7_1_0     "136(%%rbp)"
594 #define LB_CARRY_MASK  "144(%%rbp)"
595 #define HB_CLEAR_MASK  "152(%%rbp)"
596 #define AMASK0_8_0     "160(%%rbp)"
597 #define AMASK6_2_0     "168(%%rbp)"
598 #define AMASK4_4_0     "176(%%rbp)"
599 #define AMASK0_2_6     "184(%%rbp)"
600 #define AMASK2_3_3     "192(%%rbp)"
601 #define AMASK4_2_2     "200(%%rbp)"
602 
603 #else // !PNG_x86_64_USE_GOTPCREL
604 
// One standalone, 8-byte-aligned variable per constant (same values as the
// _c64 struct in the GOTPCREL branch).  The asm text names these symbols
// directly through the MASK*/AMASK* strings, which the compiler does not see
// as a use, so every one carries __attribute__((used)) to keep it from being
// optimized out (see the 20070524 changelog entry).
605 static PNG_CONST ull _mask8_0  __attribute__((used, aligned(8))) = 0x0102040810204080LL;
606 
607 static PNG_CONST ull _mask16_1 __attribute__((used, aligned(8))) = 0x0101020204040808LL;
608 static PNG_CONST ull _mask16_0 __attribute__((used, aligned(8))) = 0x1010202040408080LL;
609 
610 static PNG_CONST ull _mask24_2 __attribute__((used, aligned(8))) = 0x0101010202020404LL;
611 static PNG_CONST ull _mask24_1 __attribute__((used, aligned(8))) = 0x0408080810101020LL;
612 static PNG_CONST ull _mask24_0 __attribute__((used, aligned(8))) = 0x2020404040808080LL;
613 
614 static PNG_CONST ull _mask32_3 __attribute__((used, aligned(8))) = 0x0101010102020202LL;
615 static PNG_CONST ull _mask32_2 __attribute__((used, aligned(8))) = 0x0404040408080808LL;
616 static PNG_CONST ull _mask32_1 __attribute__((used, aligned(8))) = 0x1010101020202020LL;
617 static PNG_CONST ull _mask32_0 __attribute__((used, aligned(8))) = 0x4040404080808080LL;
618 
619 static PNG_CONST ull _mask48_5 __attribute__((used, aligned(8))) = 0x0101010101010202LL;
620 static PNG_CONST ull _mask48_4 __attribute__((used, aligned(8))) = 0x0202020204040404LL;
621 static PNG_CONST ull _mask48_3 __attribute__((used, aligned(8))) = 0x0404080808080808LL;
622 static PNG_CONST ull _mask48_2 __attribute__((used, aligned(8))) = 0x1010101010102020LL;
623 static PNG_CONST ull _mask48_1 __attribute__((used, aligned(8))) = 0x2020202040404040LL;
624 static PNG_CONST ull _mask48_0 __attribute__((used, aligned(8))) = 0x4040808080808080LL;
625 
626 // png_do_read_interlace() constants:
// (_amask5_3_0 is also used by the avg and paeth filters; both are marked
// `used` like every other constant here so they cannot be discarded)
627 static PNG_CONST ull _amask5_3_0  __attribute__((used, aligned(8))) = 0x0000000000FFFFFFLL;  // was _const4
628 static PNG_CONST ull _amask7_1_0  __attribute__((used, aligned(8))) = 0x00000000000000FFLL;  // was _const6
629 
630 // png_read_filter_row_mmx_avg() constants:
631 static PNG_CONST ull _LBCarryMask __attribute__((used, aligned(8))) = 0x0101010101010101LL;
632 static PNG_CONST ull _HBClearMask __attribute__((used, aligned(8))) = 0x7f7f7f7f7f7f7f7fLL;
633 static PNG_CONST ull _amask0_8_0  __attribute__((used, aligned(8))) = 0xFFFFFFFFFFFFFFFFLL;
634 static PNG_CONST ull _amask6_2_0  __attribute__((used, aligned(8))) = 0x000000000000FFFFLL;
635 
636 // png_read_filter_row_mmx_paeth() constants:
637 static PNG_CONST ull _amask4_4_0  __attribute__((used, aligned(8))) = 0x00000000FFFFFFFFLL;
638 static PNG_CONST ull _amask0_2_6  __attribute__((used, aligned(8))) = 0xFFFF000000000000LL;
639 
640 // png_read_filter_row_mmx_sub() constants:
641 static PNG_CONST ull _amask2_3_3  __attribute__((used, aligned(8))) = 0x0000FFFFFF000000LL;
642 static PNG_CONST ull _amask4_2_2  __attribute__((used, aligned(8))) = 0x00000000FFFF0000LL;
643 
// Assembler-operand spellings of the constants above.  Each macro is a string
// literal holding the bare symbol name, so it can be pasted into an asm
// template by literal concatenation, e.g.
//     "movq   " MASK24_0 ", %%mm0 \n\t"
// These definitions belong to one branch of the PNG_x86_64_USE_GOTPCREL
// conditional that is closed just below; the other branch is outside this
// section.
644 #define MASK8_0        "_mask8_0"
645 #define MASK16_0       "_mask16_0"
646 #define MASK16_1       "_mask16_1"
647 #define MASK24_0       "_mask24_0"
648 #define MASK24_1       "_mask24_1"
649 #define MASK24_2       "_mask24_2"
650 #define MASK32_0       "_mask32_0"
651 #define MASK32_1       "_mask32_1"
652 #define MASK32_2       "_mask32_2"
653 #define MASK32_3       "_mask32_3"
654 #define MASK48_0       "_mask48_0"
655 #define MASK48_1       "_mask48_1"
656 #define MASK48_2       "_mask48_2"
657 #define MASK48_3       "_mask48_3"
658 #define MASK48_4       "_mask48_4"
659 #define MASK48_5       "_mask48_5"
660 #define AMASK5_3_0     "_amask5_3_0"
661 #define AMASK7_1_0     "_amask7_1_0"
662 #define LB_CARRY_MASK  "_LBCarryMask"
663 #define HB_CLEAR_MASK  "_HBClearMask"
664 #define AMASK0_8_0     "_amask0_8_0"
665 #define AMASK6_2_0     "_amask6_2_0"
666 #define AMASK4_4_0     "_amask4_4_0"
667 #define AMASK0_2_6     "_amask0_2_6"
668 #define AMASK2_3_3     "_amask2_3_3"
669 #define AMASK4_2_2     "_amask4_2_2"
670 
671 #endif // ?PNG_x86_64_USE_GOTPCREL
672 
673 
674 #if defined(PNG_HAVE_MMX_READ_FILTER_ROW) || defined(PNG_HAVE_MMX_COMBINE_ROW)
675 
676 // this block is specific to png_read_filter_row_mmx_paeth() except for
677 // LOAD_GOT_rbp and RESTORE_rbp, which are also used in png_combine_row()
// Scratch locations for the Paeth routine's pa/pb/pc values, plus the optional
// setup of %rbp for GOT-relative constant loads.
//
//  - PNG_x86_64_USE_GOTPCREL:  the temporaries live in r11d/r12d/r13d.  Those
//    registers are either named in the clobber list (when
//    PNG_CLOBBER_x86_64_REGS_SUPPORTED is defined) or saved and restored by
//    hand with pushq/popq, in which case the CLOBBER macros are empty.
//    LOAD_GOT_rbp pushes %rbp and loads the GOT entry of the symbol _c64
//    (defined outside this section) into it; RESTORE_rbp pops it back.
//  - otherwise:  the temporaries are static ints, defined only when
//    PNG_THREAD_UNSAFE_OK is set, and the rbp macros expand to nothing.
//
// The _CLOBBER_* variants (leading underscore) carry their own leading comma;
// the plain CLOBBER_* variants do not.
//
// NOTE(review): in the non-GOTPCREL branch without PNG_THREAD_UNSAFE_OK, none
//   of pa_TEMP/pb_TEMP/pc_TEMP or the r11_r12_r13 macros is defined here;
//   presumably the code that uses them is compiled out in that configuration
//   -- confirm.
678 #if defined(PNG_x86_64_USE_GOTPCREL)
679 #  define pa_TEMP                "%%r11d"
680 #  define pb_TEMP                "%%r12d"
681 #  define pc_TEMP                "%%r13d"
682 #  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)  // works as of gcc 3.4.3 ...
683 #    define SAVE_r11_r12_r13
684 #    define RESTORE_r11_r12_r13
685 #    define _CLOBBER_r11_r12_r13 ,"%r11", "%r12", "%r13"
686 #    define CLOBBER_r11_r12_r13  "%r11", "%r12", "%r13"
687 #  else // !PNG_CLOBBER_x86_64_REGS_SUPPORTED
688 #    define SAVE_r11_r12_r13     "pushq %%r11  \n\t" \
689                                  "pushq %%r12  \n\t" \
690                                  "pushq %%r13  \n\t"  // "normally 0-extended"
691 #    define RESTORE_r11_r12_r13  "popq  %%r13  \n\t" \
692                                  "popq  %%r12  \n\t" \
693                                  "popq  %%r11  \n\t"
694 #    define _CLOBBER_r11_r12_r13
695 #    define CLOBBER_r11_r12_r13
696 #  endif
697 #  define LOAD_GOT_rbp           "pushq %%rbp                        \n\t" \
698                                  "movq  _c64@GOTPCREL(%%rip), %%rbp  \n\t"
699 #  define RESTORE_rbp            "popq  %%rbp                        \n\t"
700 #else // 32-bit and/or non-PIC
701 #  if defined(PNG_THREAD_UNSAFE_OK)
702      // These variables are used in png_read_filter_row_mmx_paeth() and would be
703      //   local variables if not for gcc-inline-assembly addressing limitations
704      //   (some apparently related to ELF format, others to CPU type).
705      //
706      // WARNING: Their presence defeats the thread-safety of libpng.
707      static int                     _patemp  __attribute__((used));
708      static int                     _pbtemp  __attribute__((used));
709      static int                     _pctemp  __attribute__((used));
710 #    define pa_TEMP                "_patemp"
711 #    define pb_TEMP                "_pbtemp"  // temp variables for
712 #    define pc_TEMP                "_pctemp"  //  Paeth routine
713 #    define SAVE_r11_r12_r13
714 #    define RESTORE_r11_r12_r13
715 #    define _CLOBBER_r11_r12_r13   // not using regs => not clobbering
716 #    define CLOBBER_r11_r12_r13
717 #  endif // PNG_THREAD_UNSAFE_OK
718 #  define LOAD_GOT_rbp
719 #  define RESTORE_rbp
720 #endif
721 
// Width-dependent register names and save/restore snippets for the asm blocks.
//
//  - x86-64:  %ebp is reported through the clobber list, and the "full
//    length" value is parked in %r15d; %r15 itself is either clobber-listed
//    (PNG_CLOBBER_x86_64_REGS_SUPPORTED) or pushed/popped by hand.
//  - 32-bit:  %ebp and the full-length value are saved on the stack with
//    pushl/popl, and all r15 macros are empty.
//
// RESTORE_FullLength and CLEAR_BOTTOM_3_BITS are deliberately incomplete
// instructions ending in ", " or a space:  the user must append the
// destination register (and the trailing "\n\t").
//
// PBP/PAX/PBX/PCX/PDX/PSI name the pointer-width form of each register
// (r?? on x86-64, e?? on 32-bit).
722 #if defined(__x86_64__)
723 #  define SAVE_ebp
724 #  define RESTORE_ebp
725 #  define _CLOBBER_ebp         ,"%ebp"
726 #  define CLOBBER_ebp          "%ebp"
727 #  define SAVE_FullLength      "movl %%eax, %%r15d  \n\t"
728 #  define RESTORE_FullLength   "movl %%r15d, "     // may go into eax or ecx
729 #  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)   // works as of gcc 3.4.3 ...
730 #    define SAVE_r15
731 #    define RESTORE_r15
732 #    define _CLOBBER_r15       ,"%r15"
733 #  else
734 #    define SAVE_r15           "pushq %%r15  \n\t"
735 #    define RESTORE_r15        "popq  %%r15  \n\t"
736 #    define _CLOBBER_r15
737 #  endif
738 #  define PBP                  "%%rbp"             // regs used for 64-bit
739 #  define PAX                  "%%rax"             //  pointers or in
740 #  define PBX                  "%%rbx"             //  combination with
741 #  define PCX                  "%%rcx"             //  64-bit pointer-regs
742 #  define PDX                  "%%rdx"             //  (base/index pairs,
743 #  define PSI                  "%%rsi"             //  add/sub/mov pairs)
744 #  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffffffffffff8, "
745 #else
746 #  define SAVE_ebp             "pushl %%ebp \n\t"  // clobber list doesn't work
747 #  define RESTORE_ebp          "popl  %%ebp \n\t"  //  for %ebp on 32-bit; not
748 #  define _CLOBBER_ebp                             //  clear why not
749 #  define CLOBBER_ebp
750 #  define SAVE_FullLength      "pushl %%eax \n\t"
751 #  define RESTORE_FullLength   "popl "             // eax (avg) or ecx (paeth)
752 #  define SAVE_r15
753 #  define RESTORE_r15
754 #  define _CLOBBER_r15
755 #  define PBP                  "%%ebp"             // regs used for or in
756 #  define PAX                  "%%eax"             //  combination with
757 #  define PBX                  "%%ebx"             //  "normal," 32-bit
758 #  define PCX                  "%%ecx"             //  pointers
759 #  define PDX                  "%%edx"
760 #  define PSI                  "%%esi"
761 #  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffff8, "
762 #endif
763 
// Punctuation for clobber lists that are assembled from the optional
// CLOBBER_ebp, CLOBBER_GOT_ebx and CLOBBER_r11_r12_r13 pieces.  Because any of
// those pieces may expand to nothing, the ':' that opens the list and the ','
// between pieces must be conditional as well; each macro below expands to
// that separator or to nothing.  (CLOBBER_GOT_ebx is defined further down the
// file; that is fine because macros are expanded only where they are used.)

764 // CLOB_COMMA_ebx_ebp:  need comma ONLY if both CLOBBER_ebp and CLOBBER_GOT_ebx
765 //                      have values, i.e., only if __x86_64__ AND !__PIC__
766 #if defined(__x86_64__) && !defined(__PIC__)
767 #  define CLOB_COMMA_ebx_ebp    , // clobbering both ebp and ebx => need comma
768 #else
769 #  define CLOB_COMMA_ebx_ebp
770 #endif
771 
772 // CLOB_COMMA_ebX_r1X:  need comma UNLESS both CLOBBER_ebp and CLOBBER_GOT_ebx
773 //                   are empty OR CLOBBER_r11_r12_r13 is empty--i.e., NO comma
774 //                   if (!__x86_64__ AND __PIC__) OR !(PNG_x86_64_USE_GOTPCREL
775 //                   AND PNG_CLOBBER_x86_64_REGS_SUPPORTED)   (double sigh...)
// (the #if below is the same condition with the negated AND expanded by
//  De Morgan into two separate !defined() terms)
776 #if (!defined(__x86_64__) && defined(__PIC__)) || \
777     !defined(PNG_x86_64_USE_GOTPCREL) || \
778     !defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)
779 #  define CLOB_COMMA_ebX_r1X
780 #else
781 #  define CLOB_COMMA_ebX_r1X    , // clobbering (ebp OR ebx) AND r11_r12_r13
782 #endif
783 
784 // CLOB_COLON_ebx_ebp:  need colon unless CLOBBER_ebp and CLOBBER_GOT_ebx are
785 //                      BOTH empty--i.e., NO colon if (!__x86_64__ AND __PIC__)
786 // CLOB_COLON_ebx_ebp_r1X:  if, in addition, CLOBBER_r11_r12_r13 is empty, then
787 //                          no colon for Paeth blocks, either--i.e., NO colon
788 //                          if !(PNG_x86_64_USE_GOTPCREL AND
789 //                               PNG_CLOBBER_x86_64_REGS_SUPPORTED)
790 #if (!defined(__x86_64__) && defined(__PIC__))
791 #  define CLOB_COLON_ebx_ebp
792 #  if !(defined(PNG_x86_64_USE_GOTPCREL) && \
793         defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED))
794 #    define CLOB_COLON_ebx_ebp_r1X
795 #  else
796 #    define CLOB_COLON_ebx_ebp_r1X  : // clobbering ebp OR ebx OR r11_r12_r13
797 #  endif
798 #else
799 #  define CLOB_COLON_ebx_ebp        : // clobbering ebp OR ebx
800 #  define CLOB_COLON_ebx_ebp_r1X    : // clobbering ebp OR ebx OR r11_r12_r13
801 #endif
802 
803 #endif // PNG_HAVE_MMX_READ_FILTER_ROW || PNG_HAVE_MMX_COMBINE_ROW
804 
// Handling of %ebx/%rbx around asm blocks.
//  - __PIC__:  the register is saved and restored by hand with push/pop
//    (64-bit or 32-bit form as appropriate) and is therefore left out of the
//    clobber list, so both CLOBBER macros are empty.
//  - otherwise:  nothing is saved and "%ebx" is reported as clobbered.
// _CLOBBER_GOT_ebx carries its own leading comma; CLOBBER_GOT_ebx does not.
805 #if defined(__PIC__)  // macros to save, restore index to Global Offset Table
806 #  if defined(__x86_64__)
807 #    define SAVE_GOT_ebx     "pushq %%rbx \n\t"
808 #    define RESTORE_GOT_ebx  "popq  %%rbx \n\t"
809 #  else
810 #    define SAVE_GOT_ebx     "pushl %%ebx \n\t"
811 #    define RESTORE_GOT_ebx  "popl  %%ebx \n\t"
812 #  endif
813 #  define _CLOBBER_GOT_ebx   // explicitly saved, restored => not clobbered
814 #  define CLOBBER_GOT_ebx
815 #else
816 #  define SAVE_GOT_ebx
817 #  define RESTORE_GOT_ebx
818 #  define _CLOBBER_GOT_ebx   ,"%ebx"
819 #  define CLOBBER_GOT_ebx    "%ebx"
820 #endif
821 
// Named bytes-per-pixel values, available only when the MMX combine-row or
// read-interlace code is compiled in.  The C fallback paths multiply pass
// offsets, strides and lengths by these instead of by bare literals.
822 #if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_HAVE_MMX_READ_INTERLACE)
823 #  define BPP2  2
824 #  define BPP3  3  // bytes per pixel (a.k.a. pixel_bytes)
825 #  define BPP4  4  // (defined only to help avoid cut-and-paste errors)
826 #  define BPP6  6
827 #  define BPP8  8
828 #endif
829 
830 
831 
// Result of the run-time MMX probe.  png_mmx_support() stores its answer
// here; png_combine_row() calls png_mmx_support() itself if it still finds
// the initial value 2.
832 static int _mmx_supported = 2; // 0: no MMX; 1: MMX supported; 2: not tested
833 
834 /*===========================================================================*/
835 /*                                                                           */
836 /*                      P N G _ M M X _ S U P P O R T                        */
837 /*                                                                           */
838 /*===========================================================================*/
839 
840 // GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
841 //             (2) all instructions compile with gcc 2.7.2.3 and later
842 //           x (3) the function is moved down here to prevent gcc from
843 //           x      inlining it in multiple places and then barfing be-
844 //           x      cause the ".NOT_SUPPORTED" label is multiply defined
845 //                  [need to retest with gcc 2.7.2.3]
846 
847 // GRR 20070524:  This declaration apparently is compatible with but supersedes
848 //   the one in png.h; in any case, the generated object file is slightly
849 //   smaller.  It is unnecessary with gcc 4.1.2, but gcc 2.x apparently
850 //   replicated the ".NOT_SUPPORTED" label in each location the function was
851 //   inlined, leading to compilation errors due to the "multiply defined"
852 //   label.  Old workaround was to leave the function at the end of this
853 //   file; new one (still testing) is to use a gcc-specific function attribute
854 //   to prevent local inlining.
855 int PNGAPI
856 png_mmx_support(void) __attribute__((noinline));
857 
858 int PNGAPI
png_mmx_support(void)859 png_mmx_support(void)
860 {
861 #if defined(PNG_MMX_CODE_SUPPORTED)  // superfluous, but what the heck
862     int result;
863     __asm__ __volatile__ (
864 #if defined(__x86_64__)
865         "pushq %%rbx          \n\t"  // rbx gets clobbered by CPUID instruction
866         "pushq %%rcx          \n\t"  // so does rcx...
867         "pushq %%rdx          \n\t"  // ...and rdx (but rcx & rdx safe on Linux)
868         "pushfq               \n\t"  // save Eflag to stack
869         "popq %%rax           \n\t"  // get Eflag from stack into rax
870         "movq %%rax, %%rcx    \n\t"  // make another copy of Eflag in rcx
871         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
872         "pushq %%rax          \n\t"  // save modified Eflag back to stack
873         "popfq                \n\t"  // restore modified value to Eflag reg
874         "pushfq               \n\t"  // save Eflag to stack
875         "popq %%rax           \n\t"  // get Eflag from stack
876         "pushq %%rcx          \n\t"  // save original Eflag to stack
877         "popfq                \n\t"  // restore original Eflag
878 #else
879         "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
880         "pushl %%ecx          \n\t"  // so does ecx...
881         "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
882         "pushfl               \n\t"  // save Eflag to stack
883         "popl %%eax           \n\t"  // get Eflag from stack into eax
884         "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
885         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
886         "pushl %%eax          \n\t"  // save modified Eflag back to stack
887         "popfl                \n\t"  // restore modified value to Eflag reg
888         "pushfl               \n\t"  // save Eflag to stack
889         "popl %%eax           \n\t"  // get Eflag from stack
890         "pushl %%ecx          \n\t"  // save original Eflag to stack
891         "popfl                \n\t"  // restore original Eflag
892 #endif
893         "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
894         "jz 0f                \n\t"  // if same, CPUID instr. is not supported
895 
896         "xorl %%eax, %%eax    \n\t"  // set eax to zero
897 //      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
898         "cpuid                \n\t"  // get the CPU identification info
899         "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
900         "jl 0f                \n\t"  // if eax is zero, MMX is not supported
901 
902         "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
903         "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
904                                      // faster than the instruction "mov eax, 1"
905         "cpuid                \n\t"  // get the CPU identification info again
906         "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
907         "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
908         "jz 0f                \n\t"  // non-zero = yes, MMX IS supported
909 
910         "movl $1, %%eax       \n\t"  // set return value to 1
911         "jmp  1f              \n\t"  // DONE:  have MMX support
912 
913     "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
914         "movl $0, %%eax       \n\t"  // set return value to 0
915     "1:                       \n\t"  // .RETURN: target label for jump instructions
916 #if defined(__x86_64__)
917         "popq %%rdx           \n\t"  // restore rdx
918         "popq %%rcx           \n\t"  // restore rcx
919         "popq %%rbx           \n\t"  // restore rbx
920 #else
921         "popl %%edx           \n\t"  // restore edx
922         "popl %%ecx           \n\t"  // restore ecx
923         "popl %%ebx           \n\t"  // restore ebx
924 #endif
925 
926 //      "ret                  \n\t"  // DONE:  no MMX support
927                                      // (fall through to standard C "ret")
928 
929         : "=a" (result)              // output list
930 
931         :                            // any variables used on input (none)
932 
933                                      // no clobber list
934 //      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
935 //      , "memory"   // if write to a variable gcc thought was in a reg
936 //      , "cc"       // "condition codes" (flag bits)
937     );
938     _mmx_supported = result;
939 #else
940     _mmx_supported = 0;
941 #endif /* PNG_MMX_CODE_SUPPORTED */
942 
943     return _mmx_supported;
944 }
945 
946 
947 /*===========================================================================*/
948 /*                                                                           */
949 /*                       P N G _ C O M B I N E _ R O W                       */
950 /*                                                                           */
951 /*===========================================================================*/
952 
953 #if defined(PNG_HAVE_MMX_COMBINE_ROW)
954 
955 /* Combines the row recently read in with the previous row.
956    This routine takes care of alpha and transparency if requested.
957    This routine also handles the two methods of progressive display
958    of interlaced images, depending on the mask value.
959    The mask value describes which pixels are to be combined with
960    the row.  The pattern always repeats every 8 pixels, so just 8
961    bits are needed.  A one indicates the pixel is to be combined; a
962    zero indicates the pixel is to be skipped.  This is in addition
963    to any alpha or transparency value associated with the pixel.
964    If you want all pixels to be combined, pass 0xff (255) in mask. */
965 
966 /* Use this routine for the x86 platform - it uses a faster MMX routine
967    if the machine supports MMX. */
968 
969 void /* PRIVATE */
png_combine_row(png_structp png_ptr,png_bytep row,int mask)970 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
971 {
972    int dummy_value_a;    // fix 'forbidden register spilled' error
973    int dummy_value_c;
974    int dummy_value_d;
975    png_bytep dummy_value_S;
976    png_bytep dummy_value_D;
977 
978    png_debug(1, "in png_combine_row (pnggccrd.c)\n");
979 
980    if (_mmx_supported == 2) {
981 #if !defined(PNG_1_0_X)
982        /* this should have happened in png_init_mmx_flags() already */
983        png_warning(png_ptr, "asm_flags may not have been initialized");
984 #endif
985        png_mmx_support();
986    }
987 
988    if (mask == 0xff)
989    {
990       png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
991       png_memcpy(row, png_ptr->row_buf + 1,
992        (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
993    }
994    else   /* (png_combine_row() is never called with mask == 0) */
995    {
996       switch (png_ptr->row_info.pixel_depth)
997       {
998          case 24:       /* png_ptr->row_info.pixel_depth */
999          {
1000             png_bytep srcptr;
1001             png_bytep dstptr;
1002 
1003 #if !defined(PNG_1_0_X)
1004             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1005 #else
1006             if (_mmx_supported)
1007 #endif
1008             {
1009                png_uint_32 len;
1010                int diff;
1011 
1012                srcptr = png_ptr->row_buf + 1;
1013                dstptr = row;
1014                len  = png_ptr->width & ~7;          // reduce to multiple of 8
1015                diff = (int) (png_ptr->width & 7);   // amount lost
1016 
1017                __asm__ __volatile__ (
1018                   "not       %%edx            \n\t" // mask => unmask
1019                   "movd      %%edx, %%mm7     \n\t" // load bit pattern
1020                   "not       %%edx            \n\t" // unmask => mask for later
1021                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1022                   "punpcklbw %%mm7, %%mm7     \n\t"
1023                   "punpcklwd %%mm7, %%mm7     \n\t"
1024                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1025 
1026                   LOAD_GOT_rbp
1027                   "movq   " MASK24_0 ", %%mm0 \n\t" // _mask24_0 -> mm0
1028                   "movq   " MASK24_1 ", %%mm1 \n\t" // _mask24_1 -> mm1
1029                   "movq   " MASK24_2 ", %%mm2 \n\t" // _mask24_2 -> mm2
1030                   RESTORE_rbp
1031 
1032                   "pand      %%mm7, %%mm0     \n\t"
1033                   "pand      %%mm7, %%mm1     \n\t"
1034                   "pand      %%mm7, %%mm2     \n\t"
1035 
1036                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1037                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1038                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1039 
1040 // preload        "movl      len, %%ecx       \n\t" // load length of line
1041 // preload        "movl      srcptr, %3       \n\t" // load source
1042 // preload        "movl      dstptr, %4       \n\t" // load dest
1043 
1044                   "cmpl      $0, %%ecx        \n\t"
1045                   "jz        mainloop24end    \n\t"
1046 
1047                 "mainloop24:                  \n\t"
1048                   "movq      (%3), %%mm4      \n\t"
1049                   "pand      %%mm0, %%mm4     \n\t"
1050                   "movq      %%mm0, %%mm6     \n\t"
1051                   "movq      (%4), %%mm7      \n\t"
1052                   "pandn     %%mm7, %%mm6     \n\t"
1053                   "por       %%mm6, %%mm4     \n\t"
1054                   "movq      %%mm4, (%4)      \n\t"
1055 
1056                   "movq      8(%3), %%mm5     \n\t"
1057                   "pand      %%mm1, %%mm5     \n\t"
1058                   "movq      %%mm1, %%mm7     \n\t"
1059                   "movq      8(%4), %%mm6     \n\t"
1060                   "pandn     %%mm6, %%mm7     \n\t"
1061                   "por       %%mm7, %%mm5     \n\t"
1062                   "movq      %%mm5, 8(%4)     \n\t"
1063 
1064                   "movq      16(%3), %%mm6    \n\t"
1065                   "pand      %%mm2, %%mm6     \n\t"
1066                   "movq      %%mm2, %%mm4     \n\t"
1067                   "movq      16(%4), %%mm7    \n\t"
1068                   "pandn     %%mm7, %%mm4     \n\t"
1069                   "por       %%mm4, %%mm6     \n\t"
1070                   "movq      %%mm6, 16(%4)    \n\t"
1071 
1072                   "add       $24, %3          \n\t" // inc by 24 bytes processed
1073                   "add       $24, %4          \n\t"
1074                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1075 
1076                   "ja        mainloop24       \n\t"
1077 
1078                 "mainloop24end:               \n\t"
1079 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1080                   "movl      %%eax, %%ecx     \n\t"
1081                   "cmpl      $0, %%ecx        \n\t"
1082                   "jz        end24            \n\t"
1083 // preload        "movl      mask, %%edx      \n\t"
1084                   "sall      $24, %%edx       \n\t" // make low byte, high byte
1085 
1086                 "secondloop24:                \n\t"
1087                   "sall      %%edx            \n\t" // move high bit to CF
1088                   "jnc       skip24           \n\t" // if CF = 0
1089                   "movw      (%3), %%ax       \n\t"
1090                   "movw      %%ax, (%4)       \n\t"
1091                   "xorl      %%eax, %%eax     \n\t"
1092                   "movb      2(%3), %%al      \n\t"
1093                   "movb      %%al, 2(%4)      \n\t"
1094 
1095                 "skip24:                      \n\t"
1096                   "add       $3, %3           \n\t"
1097                   "add       $3, %4           \n\t"
1098                   "decl      %%ecx            \n\t"
1099                   "jnz       secondloop24     \n\t"
1100 
1101                 "end24:                       \n\t"
1102                   "EMMS                       \n\t" // DONE
1103 
1104                   : "=a" (dummy_value_a),           // output regs (dummy)
1105                     "=d" (dummy_value_d),
1106                     "=c" (dummy_value_c),
1107                     "=S" (dummy_value_S),
1108                     "=D" (dummy_value_D)
1109 
1110                   : "0" (diff),        // eax       // input regs
1111                     "1" (mask),        // edx
1112                     "2" (len),         // ecx
1113 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1114                     "3" (srcptr),      // esi/rsi
1115                     "4" (dstptr)       // edi/rdi
1116 
1117 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
1118                   : "%mm0", "%mm1", "%mm2"          // clobber list
1119                   , "%mm4", "%mm5", "%mm6", "%mm7"
1120 #endif
1121                );
1122             }
1123             else /* not _mmx_supported - use modified C routine */
1124             {
1125                register png_uint_32 i;
1126                png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1127                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1128                register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1129                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1130                register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1131                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1132                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1133                int diff = (int) (png_ptr->width & 7); /* amount lost */
1134                register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
1135 
1136                srcptr = png_ptr->row_buf + 1 + initial_val;
1137                dstptr = row + initial_val;
1138 
1139                for (i = initial_val; i < final_val; i += stride)
1140                {
1141                   png_memcpy(dstptr, srcptr, rep_bytes);
1142                   srcptr += stride;
1143                   dstptr += stride;
1144                }
1145                if (diff)  /* number of leftover pixels:  3 for pngtest */
1146                {
1147                   final_val += diff*BPP3;
1148                   for (; i < final_val; i += stride)
1149                   {
1150                      if (rep_bytes > (int)(final_val-i))
1151                         rep_bytes = (int)(final_val-i);
1152                      png_memcpy(dstptr, srcptr, rep_bytes);
1153                      srcptr += stride;
1154                      dstptr += stride;
1155                   }
1156                }
1157             } /* end of else (_mmx_supported) */
1158 
1159             break;
1160          }       /* end 24 bpp */
1161 
1162          // formerly claimed to be most common case (combining 32-bit RGBA),
1163          // but almost certainly less common than 24-bit RGB case
1164          case 32:       /* png_ptr->row_info.pixel_depth */
1165          {
1166             png_bytep srcptr;
1167             png_bytep dstptr;
1168 
1169 #if !defined(PNG_1_0_X)
1170             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1171 #else
1172             if (_mmx_supported)
1173 #endif
1174             {
1175                png_uint_32 len;
1176                int diff;
1177 
1178                srcptr = png_ptr->row_buf + 1;
1179                dstptr = row;
1180                len  = png_ptr->width & ~7;          // reduce to multiple of 8
1181                diff = (int) (png_ptr->width & 7);   // amount lost
1182 
1183                __asm__ __volatile__ (
1184                   "not       %%edx            \n\t" // mask => unmask
1185                   "movd      %%edx, %%mm7     \n\t" // load bit pattern
1186                   "not       %%edx            \n\t" // unmask => mask for later
1187                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1188                   "punpcklbw %%mm7, %%mm7     \n\t"
1189                   "punpcklwd %%mm7, %%mm7     \n\t"
1190                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1191 
1192                   LOAD_GOT_rbp
1193                   "movq   " MASK32_0 ", %%mm0 \n\t" // _mask32_0
1194                   "movq   " MASK32_1 ", %%mm1 \n\t" // _mask32_1
1195                   "movq   " MASK32_2 ", %%mm2 \n\t" // _mask32_2
1196                   "movq   " MASK32_3 ", %%mm3 \n\t" // _mask32_3
1197                   RESTORE_rbp
1198 
1199                   "pand      %%mm7, %%mm0     \n\t"
1200                   "pand      %%mm7, %%mm1     \n\t"
1201                   "pand      %%mm7, %%mm2     \n\t"
1202                   "pand      %%mm7, %%mm3     \n\t"
1203 
1204                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1205                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1206                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1207                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1208 
1209 // preload        "movl      len, %%ecx       \n\t" // load length of line
1210 // preload        "movl      srcptr, %3       \n\t" // load source
1211 // preload        "movl      dstptr, %4       \n\t" // load dest
1212 
1213                   "cmpl      $0, %%ecx        \n\t" // lcr
1214                   "jz        mainloop32end    \n\t"
1215 
1216                 "mainloop32:                  \n\t"
1217                   "movq      (%3), %%mm4      \n\t"
1218                   "pand      %%mm0, %%mm4     \n\t"
1219                   "movq      %%mm0, %%mm6     \n\t"
1220                   "movq      (%4), %%mm7      \n\t"
1221                   "pandn     %%mm7, %%mm6     \n\t"
1222                   "por       %%mm6, %%mm4     \n\t"
1223                   "movq      %%mm4, (%4)      \n\t"
1224 
1225                   "movq      8(%3), %%mm5     \n\t"
1226                   "pand      %%mm1, %%mm5     \n\t"
1227                   "movq      %%mm1, %%mm7     \n\t"
1228                   "movq      8(%4), %%mm6     \n\t"
1229                   "pandn     %%mm6, %%mm7     \n\t"
1230                   "por       %%mm7, %%mm5     \n\t"
1231                   "movq      %%mm5, 8(%4)     \n\t"
1232 
1233                   "movq      16(%3), %%mm6    \n\t"
1234                   "pand      %%mm2, %%mm6     \n\t"
1235                   "movq      %%mm2, %%mm4     \n\t"
1236                   "movq      16(%4), %%mm7    \n\t"
1237                   "pandn     %%mm7, %%mm4     \n\t"
1238                   "por       %%mm4, %%mm6     \n\t"
1239                   "movq      %%mm6, 16(%4)    \n\t"
1240 
1241                   "movq      24(%3), %%mm7    \n\t"
1242                   "pand      %%mm3, %%mm7     \n\t"
1243                   "movq      %%mm3, %%mm5     \n\t"
1244                   "movq      24(%4), %%mm4    \n\t"
1245                   "pandn     %%mm4, %%mm5     \n\t"
1246                   "por       %%mm5, %%mm7     \n\t"
1247                   "movq      %%mm7, 24(%4)    \n\t"
1248 
1249                   "add       $32, %3          \n\t" // inc by 32 bytes processed
1250                   "add       $32, %4          \n\t"
1251                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1252                   "ja        mainloop32       \n\t"
1253 
1254                 "mainloop32end:               \n\t"
1255 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1256                   "movl      %%eax, %%ecx     \n\t"
1257                   "cmpl      $0, %%ecx        \n\t"
1258                   "jz        end32            \n\t"
1259 // preload        "movl      mask, %%edx      \n\t"
1260                   "sall      $24, %%edx       \n\t" // low byte => high byte
1261 
1262                 "secondloop32:                \n\t"
1263                   "sall      %%edx            \n\t" // move high bit to CF
1264                   "jnc       skip32           \n\t" // if CF = 0
1265                   "movl      (%3), %%eax      \n\t"
1266                   "movl      %%eax, (%4)      \n\t"
1267 
1268                 "skip32:                      \n\t"
1269                   "add       $4, %3           \n\t"
1270                   "add       $4, %4           \n\t"
1271                   "decl      %%ecx            \n\t"
1272                   "jnz       secondloop32     \n\t"
1273 
1274                 "end32:                       \n\t"
1275                   "EMMS                       \n\t" // DONE
1276 
1277                   : "=a" (dummy_value_a),           // output regs (dummy)
1278                     "=d" (dummy_value_d),
1279                     "=c" (dummy_value_c),
1280                     "=S" (dummy_value_S),
1281                     "=D" (dummy_value_D)
1282 
1283                   : "0" (diff),        // eax       // input regs
1284                     "1" (mask),        // edx
1285                     "2" (len),         // ecx
1286 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1287                     "3" (srcptr),      // esi/rsi
1288                     "4" (dstptr)       // edi/rdi
1289 
1290 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
1291                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1292                   , "%mm4", "%mm5", "%mm6", "%mm7"
1293 #endif
1294                );
1295             }
1296             else /* not _mmx_supported - use modified C routine */
1297             {
1298                register png_uint_32 i;
1299                png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1300                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1301                register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1302                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1303                register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1304                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1305                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1306                int diff = (int) (png_ptr->width & 7); /* amount lost */
1307                register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
1308 
1309                srcptr = png_ptr->row_buf + 1 + initial_val;
1310                dstptr = row + initial_val;
1311 
1312                for (i = initial_val; i < final_val; i += stride)
1313                {
1314                   png_memcpy(dstptr, srcptr, rep_bytes);
1315                   srcptr += stride;
1316                   dstptr += stride;
1317                }
1318                if (diff)  /* number of leftover pixels:  3 for pngtest */
1319                {
1320                   final_val += diff*BPP4;
1321                   for (; i < final_val; i += stride)
1322                   {
1323                      if (rep_bytes > (int)(final_val-i))
1324                         rep_bytes = (int)(final_val-i);
1325                      png_memcpy(dstptr, srcptr, rep_bytes);
1326                      srcptr += stride;
1327                      dstptr += stride;
1328                   }
1329                }
1330             } /* end of else (_mmx_supported) */
1331 
1332             break;
1333          }       /* end 32 bpp */
1334 
1335          case 8:        /* png_ptr->row_info.pixel_depth */
1336          {
1337             png_bytep srcptr;
1338             png_bytep dstptr;
1339 
1340 #if !defined(PNG_1_0_X)
1341             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1342 #else
1343             if (_mmx_supported)
1344 #endif
1345             {
1346                png_uint_32 len;
1347                int diff;
1348 
1349                srcptr = png_ptr->row_buf + 1;
1350                dstptr = row;
1351                len  = png_ptr->width & ~7;          // reduce to multiple of 8
1352                diff = (int) (png_ptr->width & 7);   // amount lost
1353 
1354                __asm__ __volatile__ (
1355                   "not       %%edx            \n\t" // mask => unmask
1356                   "movd      %%edx, %%mm7     \n\t" // load bit pattern
1357                   "not       %%edx            \n\t" // unmask => mask for later
1358                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1359                   "punpcklbw %%mm7, %%mm7     \n\t"
1360                   "punpcklwd %%mm7, %%mm7     \n\t"
1361                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1362 
1363                   LOAD_GOT_rbp
1364                   "movq   " MASK8_0 ", %%mm0  \n\t" // _mask8_0 -> mm0
1365                   RESTORE_rbp
1366 
1367                   "pand      %%mm7, %%mm0     \n\t" // nonzero if keep byte
1368                   "pcmpeqb   %%mm6, %%mm0     \n\t" // zeros->1s, v versa
1369 
1370 // preload        "movl      len, %%ecx       \n\t" // load length of line
1371 // preload        "movl      srcptr, %3       \n\t" // load source
1372 // preload        "movl      dstptr, %4       \n\t" // load dest
1373 
1374                   "cmpl      $0, %%ecx        \n\t" // len == 0 ?
1375                   "je        mainloop8end     \n\t"
1376 
1377                 "mainloop8:                   \n\t"
1378                   "movq      (%3), %%mm4      \n\t" // *srcptr
1379                   "pand      %%mm0, %%mm4     \n\t"
1380                   "movq      %%mm0, %%mm6     \n\t"
1381                   "pandn     (%4), %%mm6      \n\t" // *dstptr
1382                   "por       %%mm6, %%mm4     \n\t"
1383                   "movq      %%mm4, (%4)      \n\t"
1384                   "add       $8, %3           \n\t" // inc by 8 bytes processed
1385                   "add       $8, %4           \n\t"
1386                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1387                   "ja        mainloop8        \n\t"
1388 
1389                 "mainloop8end:                \n\t"
1390 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1391                   "movl      %%eax, %%ecx     \n\t"
1392                   "cmpl      $0, %%ecx        \n\t"
1393                   "jz        end8             \n\t"
1394 // preload        "movl      mask, %%edx      \n\t"
1395                   "sall      $24, %%edx       \n\t" // make low byte, high byte
1396 
1397                 "secondloop8:                 \n\t"
1398                   "sall      %%edx            \n\t" // move high bit to CF
1399                   "jnc       skip8            \n\t" // if CF = 0
1400                   "movb      (%3), %%al       \n\t"
1401                   "movb      %%al, (%4)       \n\t"
1402 
1403                 "skip8:                       \n\t"
1404                   "inc       %3               \n\t"
1405                   "inc       %4               \n\t"
1406                   "decl      %%ecx            \n\t"
1407                   "jnz       secondloop8      \n\t"
1408 
1409                 "end8:                        \n\t"
1410                   "EMMS                       \n\t" // DONE
1411 
1412                   : "=a" (dummy_value_a),           // output regs (dummy)
1413                     "=d" (dummy_value_d),
1414                     "=c" (dummy_value_c),
1415                     "=S" (dummy_value_S),
1416                     "=D" (dummy_value_D)
1417 
1418                   : "0" (diff),        // eax       // input regs
1419                     "1" (mask),        // edx
1420                     "2" (len),         // ecx
1421 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1422                     "3" (srcptr),      // esi/rsi
1423                     "4" (dstptr)       // edi/rdi
1424 
1425 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
1426                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
1427 #endif
1428                );
1429             }
1430             else /* not _mmx_supported - use modified C routine */
1431             {
1432                register png_uint_32 i;
1433                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
1434                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1435                register int stride = png_pass_inc[png_ptr->pass];
1436                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1437                register int rep_bytes = png_pass_width[png_ptr->pass];
1438                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1439                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1440                int diff = (int) (png_ptr->width & 7); /* amount lost */
1441                register png_uint_32 final_val = len;  /* GRR bugfix */
1442 
1443                srcptr = png_ptr->row_buf + 1 + initial_val;
1444                dstptr = row + initial_val;
1445 
1446                for (i = initial_val; i < final_val; i += stride)
1447                {
1448                   png_memcpy(dstptr, srcptr, rep_bytes);
1449                   srcptr += stride;
1450                   dstptr += stride;
1451                }
1452                if (diff)  /* number of leftover pixels:  3 for pngtest */
1453                {
1454                   final_val += diff /* *BPP1 */ ;
1455                   for (; i < final_val; i += stride)
1456                   {
1457                      if (rep_bytes > (int)(final_val-i))
1458                         rep_bytes = (int)(final_val-i);
1459                      png_memcpy(dstptr, srcptr, rep_bytes);
1460                      srcptr += stride;
1461                      dstptr += stride;
1462                   }
1463                }
1464 
1465             } /* end of else (_mmx_supported) */
1466 
1467             break;
1468          }       /* end 8 bpp */
1469 
1470          case 1:        /* png_ptr->row_info.pixel_depth */
1471          {
1472             png_bytep sp;
1473             png_bytep dp;
1474             int s_inc, s_start, s_end;
1475             int m;
1476             int shift;
1477             png_uint_32 i;
1478 
1479             sp = png_ptr->row_buf + 1;
1480             dp = row;
1481             m = 0x80;
1482 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1483             if (png_ptr->transformations & PNG_PACKSWAP)
1484             {
1485                s_start = 0;
1486                s_end = 7;
1487                s_inc = 1;
1488             }
1489             else
1490 #endif
1491             {
1492                s_start = 7;
1493                s_end = 0;
1494                s_inc = -1;
1495             }
1496 
1497             shift = s_start;
1498 
1499             for (i = 0; i < png_ptr->width; i++)
1500             {
1501                if (m & mask)
1502                {
1503                   int value;
1504 
1505                   value = (*sp >> shift) & 0x1;
1506                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
1507                   *dp |= (png_byte)(value << shift);
1508                }
1509 
1510                if (shift == s_end)
1511                {
1512                   shift = s_start;
1513                   sp++;
1514                   dp++;
1515                }
1516                else
1517                   shift += s_inc;
1518 
1519                if (m == 1)
1520                   m = 0x80;
1521                else
1522                   m >>= 1;
1523             }
1524             break;
1525          }       /* end 1 bpp */
1526 
1527          case 2:        /* png_ptr->row_info.pixel_depth */
1528          {
1529             png_bytep sp;
1530             png_bytep dp;
1531             int s_start, s_end, s_inc;
1532             int m;
1533             int shift;
1534             png_uint_32 i;
1535             int value;
1536 
1537             sp = png_ptr->row_buf + 1;
1538             dp = row;
1539             m = 0x80;
1540 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1541             if (png_ptr->transformations & PNG_PACKSWAP)
1542             {
1543                s_start = 0;
1544                s_end = 6;
1545                s_inc = 2;
1546             }
1547             else
1548 #endif
1549             {
1550                s_start = 6;
1551                s_end = 0;
1552                s_inc = -2;
1553             }
1554 
1555             shift = s_start;
1556 
1557             for (i = 0; i < png_ptr->width; i++)
1558             {
1559                if (m & mask)
1560                {
1561                   value = (*sp >> shift) & 0x3;
1562                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
1563                   *dp |= (png_byte)(value << shift);
1564                }
1565 
1566                if (shift == s_end)
1567                {
1568                   shift = s_start;
1569                   sp++;
1570                   dp++;
1571                }
1572                else
1573                   shift += s_inc;
1574                if (m == 1)
1575                   m = 0x80;
1576                else
1577                   m >>= 1;
1578             }
1579             break;
1580          }       /* end 2 bpp */
1581 
1582          case 4:        /* png_ptr->row_info.pixel_depth */
1583          {
1584             png_bytep sp;
1585             png_bytep dp;
1586             int s_start, s_end, s_inc;
1587             int m;
1588             int shift;
1589             png_uint_32 i;
1590             int value;
1591 
1592             sp = png_ptr->row_buf + 1;
1593             dp = row;
1594             m = 0x80;
1595 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1596             if (png_ptr->transformations & PNG_PACKSWAP)
1597             {
1598                s_start = 0;
1599                s_end = 4;
1600                s_inc = 4;
1601             }
1602             else
1603 #endif
1604             {
1605                s_start = 4;
1606                s_end = 0;
1607                s_inc = -4;
1608             }
1609 
1610             shift = s_start;
1611 
1612             for (i = 0; i < png_ptr->width; i++)
1613             {
1614                if (m & mask)
1615                {
1616                   value = (*sp >> shift) & 0xf;
1617                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
1618                   *dp |= (png_byte)(value << shift);
1619                }
1620 
1621                if (shift == s_end)
1622                {
1623                   shift = s_start;
1624                   sp++;
1625                   dp++;
1626                }
1627                else
1628                   shift += s_inc;
1629                if (m == 1)
1630                   m = 0x80;
1631                else
1632                   m >>= 1;
1633             }
1634             break;
1635          }       /* end 4 bpp */
1636 
1637          case 16:       /* png_ptr->row_info.pixel_depth */
1638          {
1639             png_bytep srcptr;
1640             png_bytep dstptr;
1641 
1642 #if !defined(PNG_1_0_X)
1643             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1644 #else
1645             if (_mmx_supported)
1646 #endif
1647             {
1648                png_uint_32 len;
1649                int diff;
1650 
1651                srcptr = png_ptr->row_buf + 1;
1652                dstptr = row;
1653                len  = png_ptr->width & ~7;          // reduce to multiple of 8
1654                diff = (int) (png_ptr->width & 7);   // amount lost
1655 
1656                __asm__ __volatile__ (
1657                   "not       %%edx            \n\t" // mask => unmask
1658                   "movd      %%edx, %%mm7     \n\t" // load bit pattern
1659                   "not       %%edx            \n\t" // unmask => mask for later
1660                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1661                   "punpcklbw %%mm7, %%mm7     \n\t"
1662                   "punpcklwd %%mm7, %%mm7     \n\t"
1663                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1664 
1665                   LOAD_GOT_rbp
1666                   "movq   " MASK16_0 ", %%mm0 \n\t" // _mask16_0 -> mm0
1667                   "movq   " MASK16_1 ", %%mm1 \n\t" // _mask16_1 -> mm1
1668                   RESTORE_rbp
1669 
1670                   "pand      %%mm7, %%mm0     \n\t"
1671                   "pand      %%mm7, %%mm1     \n\t"
1672 
1673                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1674                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1675 
1676 // preload        "movl      len, %%ecx       \n\t" // load length of line
1677 // preload        "movl      srcptr, %3       \n\t" // load source
1678 // preload        "movl      dstptr, %4       \n\t" // load dest
1679 
1680                   "cmpl      $0, %%ecx        \n\t"
1681                   "jz        mainloop16end    \n\t"
1682 
1683                 "mainloop16:                  \n\t"
1684                   "movq      (%3), %%mm4      \n\t"
1685                   "pand      %%mm0, %%mm4     \n\t"
1686                   "movq      %%mm0, %%mm6     \n\t"
1687                   "movq      (%4), %%mm7      \n\t"
1688                   "pandn     %%mm7, %%mm6     \n\t"
1689                   "por       %%mm6, %%mm4     \n\t"
1690                   "movq      %%mm4, (%4)      \n\t"
1691 
1692                   "movq      8(%3), %%mm5     \n\t"
1693                   "pand      %%mm1, %%mm5     \n\t"
1694                   "movq      %%mm1, %%mm7     \n\t"
1695                   "movq      8(%4), %%mm6     \n\t"
1696                   "pandn     %%mm6, %%mm7     \n\t"
1697                   "por       %%mm7, %%mm5     \n\t"
1698                   "movq      %%mm5, 8(%4)     \n\t"
1699 
1700                   "add       $16, %3          \n\t" // inc by 16 bytes processed
1701                   "add       $16, %4          \n\t"
1702                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1703                   "ja        mainloop16       \n\t"
1704 
1705                 "mainloop16end:               \n\t"
1706 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1707                   "movl      %%eax, %%ecx     \n\t"
1708                   "cmpl      $0, %%ecx        \n\t"
1709                   "jz        end16            \n\t"
1710 // preload        "movl      mask, %%edx      \n\t"
1711                   "sall      $24, %%edx       \n\t" // make low byte, high byte
1712 
1713                 "secondloop16:                \n\t"
1714                   "sall      %%edx            \n\t" // move high bit to CF
1715                   "jnc       skip16           \n\t" // if CF = 0
1716                   "movw      (%3), %%ax       \n\t"
1717                   "movw      %%ax, (%4)       \n\t"
1718 
1719                 "skip16:                      \n\t"
1720                   "add       $2, %3           \n\t"
1721                   "add       $2, %4           \n\t"
1722                   "decl      %%ecx            \n\t"
1723                   "jnz       secondloop16     \n\t"
1724 
1725                 "end16:                       \n\t"
1726                   "EMMS                       \n\t" // DONE
1727 
1728                   : "=a" (dummy_value_a),           // output regs (dummy)
1729                     "=d" (dummy_value_d),
1730                     "=c" (dummy_value_c),
1731                     "=S" (dummy_value_S),
1732                     "=D" (dummy_value_D)
1733 
1734                   : "0" (diff),        // eax       // input regs
1735                     "1" (mask),        // edx
1736                     "2" (len),         // ecx
1737 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1738                     "3" (srcptr),      // esi/rsi
1739                     "4" (dstptr)       // edi/rdi
1740 
1741 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
1742                   : "%mm0", "%mm1", "%mm4"          // clobber list
1743                   , "%mm5", "%mm6", "%mm7"
1744 #endif
1745                );
1746             }
1747             else /* not _mmx_supported - use modified C routine */
1748             {
1749                register png_uint_32 i;
1750                png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
1751                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1752                register int stride = BPP2 * png_pass_inc[png_ptr->pass];
1753                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1754                register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
1755                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1756                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1757                int diff = (int) (png_ptr->width & 7); /* amount lost */
1758                register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
1759 
1760                srcptr = png_ptr->row_buf + 1 + initial_val;
1761                dstptr = row + initial_val;
1762 
1763                for (i = initial_val; i < final_val; i += stride)
1764                {
1765                   png_memcpy(dstptr, srcptr, rep_bytes);
1766                   srcptr += stride;
1767                   dstptr += stride;
1768                }
1769                if (diff)  /* number of leftover pixels:  3 for pngtest */
1770                {
1771                   final_val += diff*BPP2;
1772                   for (; i < final_val; i += stride)
1773                   {
1774                      if (rep_bytes > (int)(final_val-i))
1775                         rep_bytes = (int)(final_val-i);
1776                      png_memcpy(dstptr, srcptr, rep_bytes);
1777                      srcptr += stride;
1778                      dstptr += stride;
1779                   }
1780                }
1781             } /* end of else (_mmx_supported) */
1782 
1783             break;
1784          }       /* end 16 bpp */
1785 
1786          case 48:       /* png_ptr->row_info.pixel_depth */
1787          {
1788             png_bytep srcptr;
1789             png_bytep dstptr;
1790 
1791 #if !defined(PNG_1_0_X)
1792             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1793 #else
1794             if (_mmx_supported)
1795 #endif
1796             {
1797                png_uint_32 len;
1798                int diff;
1799 
1800                srcptr = png_ptr->row_buf + 1;
1801                dstptr = row;
1802                len  = png_ptr->width & ~7;          // reduce to multiple of 8
1803                diff = (int) (png_ptr->width & 7);   // amount lost
1804 
1805                __asm__ __volatile__ (
1806                   "not       %%edx            \n\t" // mask => unmask
1807                   "movd      %%edx, %%mm7     \n\t" // load bit pattern
1808                   "not       %%edx            \n\t" // unmask => mask for later
1809                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1810                   "punpcklbw %%mm7, %%mm7     \n\t"
1811                   "punpcklwd %%mm7, %%mm7     \n\t"
1812                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1813 
1814                   LOAD_GOT_rbp
1815                   "movq   " MASK48_0 ", %%mm0 \n\t" // _mask48_0 -> mm0
1816                   "movq   " MASK48_1 ", %%mm1 \n\t" // _mask48_1 -> mm1
1817                   "movq   " MASK48_2 ", %%mm2 \n\t" // _mask48_2 -> mm2
1818                   "movq   " MASK48_3 ", %%mm3 \n\t" // _mask48_3 -> mm3
1819                   "movq   " MASK48_4 ", %%mm4 \n\t" // _mask48_4 -> mm4
1820                   "movq   " MASK48_5 ", %%mm5 \n\t" // _mask48_5 -> mm5
1821                   RESTORE_rbp
1822 
1823                   "pand      %%mm7, %%mm0     \n\t"
1824                   "pand      %%mm7, %%mm1     \n\t"
1825                   "pand      %%mm7, %%mm2     \n\t"
1826                   "pand      %%mm7, %%mm3     \n\t"
1827                   "pand      %%mm7, %%mm4     \n\t"
1828                   "pand      %%mm7, %%mm5     \n\t"
1829 
1830                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1831                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1832                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1833                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1834                   "pcmpeqb   %%mm6, %%mm4     \n\t"
1835                   "pcmpeqb   %%mm6, %%mm5     \n\t"
1836 
1837 // preload        "movl      len, %%ecx       \n\t" // load length of line
1838 // preload        "movl      srcptr, %3       \n\t" // load source
1839 // preload        "movl      dstptr, %4       \n\t" // load dest
1840 
1841                   "cmpl      $0, %%ecx        \n\t"
1842                   "jz        mainloop48end    \n\t"
1843 
1844                 "mainloop48:                  \n\t"
1845                   "movq      (%3), %%mm7      \n\t"
1846                   "pand      %%mm0, %%mm7     \n\t"
1847                   "movq      %%mm0, %%mm6     \n\t"
1848                   "pandn     (%4), %%mm6      \n\t"
1849                   "por       %%mm6, %%mm7     \n\t"
1850                   "movq      %%mm7, (%4)      \n\t"
1851 
1852                   "movq      8(%3), %%mm6     \n\t"
1853                   "pand      %%mm1, %%mm6     \n\t"
1854                   "movq      %%mm1, %%mm7     \n\t"
1855                   "pandn     8(%4), %%mm7     \n\t"
1856                   "por       %%mm7, %%mm6     \n\t"
1857                   "movq      %%mm6, 8(%4)     \n\t"
1858 
1859                   "movq      16(%3), %%mm6    \n\t"
1860                   "pand      %%mm2, %%mm6     \n\t"
1861                   "movq      %%mm2, %%mm7     \n\t"
1862                   "pandn     16(%4), %%mm7    \n\t"
1863                   "por       %%mm7, %%mm6     \n\t"
1864                   "movq      %%mm6, 16(%4)    \n\t"
1865 
1866                   "movq      24(%3), %%mm7    \n\t"
1867                   "pand      %%mm3, %%mm7     \n\t"
1868                   "movq      %%mm3, %%mm6     \n\t"
1869                   "pandn     24(%4), %%mm6    \n\t"
1870                   "por       %%mm6, %%mm7     \n\t"
1871                   "movq      %%mm7, 24(%4)    \n\t"
1872 
1873                   "movq      32(%3), %%mm6    \n\t"
1874                   "pand      %%mm4, %%mm6     \n\t"
1875                   "movq      %%mm4, %%mm7     \n\t"
1876                   "pandn     32(%4), %%mm7    \n\t"
1877                   "por       %%mm7, %%mm6     \n\t"
1878                   "movq      %%mm6, 32(%4)    \n\t"
1879 
1880                   "movq      40(%3), %%mm7    \n\t"
1881                   "pand      %%mm5, %%mm7     \n\t"
1882                   "movq      %%mm5, %%mm6     \n\t"
1883                   "pandn     40(%4), %%mm6    \n\t"
1884                   "por       %%mm6, %%mm7     \n\t"
1885                   "movq      %%mm7, 40(%4)    \n\t"
1886 
1887                   "add       $48, %3          \n\t" // inc by 48 bytes processed
1888                   "add       $48, %4          \n\t"
1889                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1890 
1891                   "ja        mainloop48       \n\t"
1892 
1893                 "mainloop48end:               \n\t"
1894 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1895                   "movl      %%eax, %%ecx     \n\t"
1896                   "cmpl      $0, %%ecx        \n\t"
1897                   "jz        end48            \n\t"
1898 // preload        "movl      mask, %%edx      \n\t"
1899                   "sall      $24, %%edx       \n\t" // make low byte, high byte
1900 
1901                 "secondloop48:                \n\t"
1902                   "sall      %%edx            \n\t" // move high bit to CF
1903                   "jnc       skip48           \n\t" // if CF = 0
1904                   "movl      (%3), %%eax      \n\t"
1905                   "movl      %%eax, (%4)      \n\t"
1906                   "movw      4(%3), %%ax      \n\t" // GR-P bugfix 20070717
1907                   "movw      %%ax, 4(%4)      \n\t" // GR-P bugfix 20070717
1908 
1909                 "skip48:                      \n\t"
1910                   "add       $6, %3           \n\t" // GR-P bugfix 20070717
1911                   "add       $6, %4           \n\t" // GR-P bugfix 20070717
1912                   "decl      %%ecx            \n\t"
1913                   "jnz       secondloop48     \n\t"
1914 
1915                 "end48:                       \n\t"
1916                   "EMMS                       \n\t" // DONE
1917 
1918                   : "=a" (dummy_value_a),           // output regs (dummy)
1919                     "=d" (dummy_value_d),
1920                     "=c" (dummy_value_c),
1921                     "=S" (dummy_value_S),
1922                     "=D" (dummy_value_D)
1923 
1924                   : "0" (diff),        // eax       // input regs
1925                     "1" (mask),        // edx
1926                     "2" (len),         // ecx
1927 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1928                     "3" (srcptr),      // esi/rsi
1929                     "4" (dstptr)       // edi/rdi
1930 
1931 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
1932                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1933                   , "%mm4", "%mm5", "%mm6", "%mm7"
1934 #endif
1935                );
1936             }
1937             else /* not _mmx_supported - use modified C routine */
1938             {
1939                register png_uint_32 i;
1940                png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1941                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1942                register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1943                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1944                register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1945                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1946                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1947                int diff = (int) (png_ptr->width & 7); /* amount lost */
1948                register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
1949 
1950                srcptr = png_ptr->row_buf + 1 + initial_val;
1951                dstptr = row + initial_val;
1952 
1953                for (i = initial_val; i < final_val; i += stride)
1954                {
1955                   png_memcpy(dstptr, srcptr, rep_bytes);
1956                   srcptr += stride;
1957                   dstptr += stride;
1958                }
1959                if (diff)  /* number of leftover pixels:  3 for pngtest */
1960                {
1961                   final_val += diff*BPP6;
1962                   for (; i < final_val; i += stride)
1963                   {
1964                      if (rep_bytes > (int)(final_val-i))
1965                         rep_bytes = (int)(final_val-i);
1966                      png_memcpy(dstptr, srcptr, rep_bytes);
1967                      srcptr += stride;
1968                      dstptr += stride;
1969                   }
1970                }
1971             } /* end of else (_mmx_supported) */
1972 
1973             break;
1974          }       /* end 48 bpp */
1975 
1976          case 64:       /* png_ptr->row_info.pixel_depth */
1977          {
1978             png_bytep srcptr;
1979             png_bytep dstptr;
1980             register png_uint_32 i;
1981             png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1982               /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1983             register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1984               /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1985             register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1986               /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1987             png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1988             int diff = (int) (png_ptr->width & 7); /* amount lost */
1989             register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
1990 
1991             srcptr = png_ptr->row_buf + 1 + initial_val;
1992             dstptr = row + initial_val;
1993 
1994             for (i = initial_val; i < final_val; i += stride)
1995             {
1996                png_memcpy(dstptr, srcptr, rep_bytes);
1997                srcptr += stride;
1998                dstptr += stride;
1999             }
2000             if (diff)  /* number of leftover pixels:  3 for pngtest */
2001             {
2002                final_val += diff*BPP8;
2003                for (; i < final_val; i += stride)
2004                {
2005                   if (rep_bytes > (int)(final_val-i))
2006                      rep_bytes = (int)(final_val-i);
2007                   png_memcpy(dstptr, srcptr, rep_bytes);
2008                   srcptr += stride;
2009                   dstptr += stride;
2010                }
2011             }
2012 
2013             break;
2014          }       /* end 64 bpp */
2015 
2016          default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
2017          {
2018             // ERROR:  SHOULD NEVER BE REACHED
2019 #if defined(PNG_DEBUG)
2020             png_debug(1, "Internal libpng logic error (GCC "
2021               "png_combine_row() pixel_depth)\n");
2022 #endif
2023             break;
2024          }
2025       } /* end switch (png_ptr->row_info.pixel_depth) */
2026 
2027    } /* end if (non-trivial mask) */
2028 
2029 } /* end png_combine_row() */
2030 
2031 #endif /* PNG_HAVE_MMX_COMBINE_ROW */
2032 
2033 
2034 
2035 
2036 /*===========================================================================*/
2037 /*                                                                           */
2038 /*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
2039 /*                                                                           */
2040 /*===========================================================================*/
2041 
2042 #if defined(PNG_READ_INTERLACING_SUPPORTED)
2043 #if defined(PNG_HAVE_MMX_READ_INTERLACE)
2044 
2045 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
2046  * has taken place.  [GRR: what other steps come before and/or after?]
2047  */
2048 
2049 void /* PRIVATE */
2050 png_do_read_interlace(png_structp png_ptr)
2051 {
2052    png_row_infop row_info = &(png_ptr->row_info);
2053    png_bytep row = png_ptr->row_buf + 1;
2054    int pass = png_ptr->pass;
2055 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
2056    png_uint_32 transformations = png_ptr->transformations;
2057 #endif
2058 
2059    png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
2060 
2061    if (_mmx_supported == 2) {
2062 #if !defined(PNG_1_0_X)
2063        /* this should have happened in png_init_mmx_flags() already */
2064        png_warning(png_ptr, "asm_flags may not have been initialized");
2065 #endif
2066        png_mmx_support();
2067    }
2068 
2069    if (row != NULL && row_info != NULL)
2070    {
2071       png_uint_32 final_width;
2072 
2073       final_width = row_info->width * png_pass_inc[pass];
2074 
2075       switch (row_info->pixel_depth)
2076       {
2077          case 1:
2078          {
2079             png_bytep sp, dp;
2080             int sshift, dshift;
2081             int s_start, s_end, s_inc;
2082             png_byte v;
2083             png_uint_32 i;
2084             int j;
2085 
2086             sp = row + (png_size_t)((row_info->width - 1) >> 3);
2087             dp = row + (png_size_t)((final_width - 1) >> 3);
2088 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
2089             if (transformations & PNG_PACKSWAP)
2090             {
2091                sshift = (int)((row_info->width + 7) & 7);
2092                dshift = (int)((final_width + 7) & 7);
2093                s_start = 7;
2094                s_end = 0;
2095                s_inc = -1;
2096             }
2097             else
2098 #endif
2099             {
2100                sshift = 7 - (int)((row_info->width + 7) & 7);
2101                dshift = 7 - (int)((final_width + 7) & 7);
2102                s_start = 0;
2103                s_end = 7;
2104                s_inc = 1;
2105             }
2106 
2107             for (i = row_info->width; i; i--)
2108             {
2109                v = (png_byte)((*sp >> sshift) & 0x1);
2110                for (j = 0; j < png_pass_inc[pass]; j++)
2111                {
2112                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
2113                   *dp |= (png_byte)(v << dshift);
2114                   if (dshift == s_end)
2115                   {
2116                      dshift = s_start;
2117                      dp--;
2118                   }
2119                   else
2120                      dshift += s_inc;
2121                }
2122                if (sshift == s_end)
2123                {
2124                   sshift = s_start;
2125                   sp--;
2126                }
2127                else
2128                   sshift += s_inc;
2129             }
2130             break;
2131          }
2132 
2133          case 2:
2134          {
2135             png_bytep sp, dp;
2136             int sshift, dshift;
2137             int s_start, s_end, s_inc;
2138             png_uint_32 i;
2139 
2140             sp = row + (png_size_t)((row_info->width - 1) >> 2);
2141             dp = row + (png_size_t)((final_width - 1) >> 2);
2142 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
2143             if (transformations & PNG_PACKSWAP)
2144             {
2145                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
2146                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
2147                s_start = 6;
2148                s_end = 0;
2149                s_inc = -2;
2150             }
2151             else
2152 #endif
2153             {
2154                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
2155                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
2156                s_start = 0;
2157                s_end = 6;
2158                s_inc = 2;
2159             }
2160 
2161             for (i = row_info->width; i; i--)
2162             {
2163                png_byte v;
2164                int j;
2165 
2166                v = (png_byte)((*sp >> sshift) & 0x3);
2167                for (j = 0; j < png_pass_inc[pass]; j++)
2168                {
2169                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
2170                   *dp |= (png_byte)(v << dshift);
2171                   if (dshift == s_end)
2172                   {
2173                      dshift = s_start;
2174                      dp--;
2175                   }
2176                   else
2177                      dshift += s_inc;
2178                }
2179                if (sshift == s_end)
2180                {
2181                   sshift = s_start;
2182                   sp--;
2183                }
2184                else
2185                   sshift += s_inc;
2186             }
2187             break;
2188          }
2189 
2190          case 4:
2191          {
2192             png_bytep sp, dp;
2193             int sshift, dshift;
2194             int s_start, s_end, s_inc;
2195             png_uint_32 i;
2196 
2197             sp = row + (png_size_t)((row_info->width - 1) >> 1);
2198             dp = row + (png_size_t)((final_width - 1) >> 1);
2199 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
2200             if (transformations & PNG_PACKSWAP)
2201             {
2202                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
2203                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
2204                s_start = 4;
2205                s_end = 0;
2206                s_inc = -4;
2207             }
2208             else
2209 #endif
2210             {
2211                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
2212                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
2213                s_start = 0;
2214                s_end = 4;
2215                s_inc = 4;
2216             }
2217 
2218             for (i = row_info->width; i; i--)
2219             {
2220                png_byte v;
2221                int j;
2222 
2223                v = (png_byte)((*sp >> sshift) & 0xf);
2224                for (j = 0; j < png_pass_inc[pass]; j++)
2225                {
2226                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
2227                   *dp |= (png_byte)(v << dshift);
2228                   if (dshift == s_end)
2229                   {
2230                      dshift = s_start;
2231                      dp--;
2232                   }
2233                   else
2234                      dshift += s_inc;
2235                }
2236                if (sshift == s_end)
2237                {
2238                   sshift = s_start;
2239                   sp--;
2240                }
2241                else
2242                   sshift += s_inc;
2243             }
2244             break;
2245          }
2246 
2247        /*====================================================================*/
2248 
2249          default: /* 8-bit or larger (this is where the routine is modified) */
2250          {
2251             png_bytep sptr, dp;
2252             png_uint_32 i;
2253             png_size_t pixel_bytes;
2254             int width = (int)row_info->width;
2255 
2256             pixel_bytes = (row_info->pixel_depth >> 3);
2257 
2258             /* point sptr at the last pixel in the pre-expanded row: */
2259             sptr = row + (width - 1) * pixel_bytes;
2260 
2261             /* point dp at the last pixel position in the expanded row: */
2262             dp = row + (final_width - 1) * pixel_bytes;
2263 
2264             /* New code by Nirav Chhatrapati - Intel Corporation */
2265 
2266 #if !defined(PNG_1_0_X)
2267             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
2268 #else
2269             if (_mmx_supported)
2270 #endif
2271             {
2272                int dummy_value_c;        // fix 'forbidden register spilled'
2273                png_bytep dummy_value_S;
2274                png_bytep dummy_value_D;
2275                png_bytep dummy_value_a;
2276                png_bytep dummy_value_d;
2277 
2278                //--------------------------------------------------------------
2279                if (pixel_bytes == BPP3)
2280                {
2281                   if (((pass == 4) || (pass == 5)) && width)
2282                   {
2283                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
2284                      if (width_mmx < 0)
2285                          width_mmx = 0;
2286                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
2287                      if (width_mmx)
2288                      {
2289                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
2290                         // sptr points at last pixel in pre-expanded row
2291                         // dp points at last pixel position in expanded row
2292                         __asm__ __volatile__ (
2293                            "sub  $3, %1             \n\t"
2294                            "sub  $9, %2             \n\t"
2295                                         // (png_pass_inc[pass] + 1)*pixel_bytes
2296 
2297                         ".loop3_pass4:              \n\t"
2298                            "movq (%1), %%mm0        \n\t" // x x 5 4 3 2 1 0
2299                            "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
2300                            "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
2301                            "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
2302                            "pand (%3), %%mm1        \n\t" // z z z z z 2 1 0
2303                            "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
2304                            "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
2305                            "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
2306                            "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
2307                            "movq %%mm0, (%2)        \n\t"
2308                            "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
2309                            "pand (%4), %%mm3        \n\t" // z z z z z z z 5
2310                            "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
2311                            "sub  $6, %1             \n\t"
2312                            "movd %%mm2, 8(%2)       \n\t"
2313                            "sub  $12, %2            \n\t"
2314                            "subl $2, %%ecx          \n\t"
2315                            "jnz .loop3_pass4        \n\t"
2316                            "EMMS                    \n\t" // DONE
2317 
2318                            : "=c" (dummy_value_c),        // output regs (dummy)
2319                              "=S" (dummy_value_S),
2320                              "=D" (dummy_value_D),
2321                              "=a" (dummy_value_a),
2322                              "=d" (dummy_value_d)
2323 
2324                            : "0" (width_mmx),     // ecx  // input regs
2325                              "1" (sptr),          // esi/rsi
2326                              "2" (dp),            // edi/rdi
2327 #if defined(PNG_x86_64_USE_GOTPCREL)     // formerly _const4 and _const6:
2328                              "3" (&_c64._amask5_3_0), // (0x0000000000FFFFFFLL)
2329                              "4" (&_c64._amask7_1_0)  // (0x00000000000000FFLL)
2330 #else
2331                              "3" (&_amask5_3_0),  // eax (0x0000000000FFFFFFLL)
2332                              "4" (&_amask7_1_0)   // edx (0x00000000000000FFLL)
2333 #endif
2334 
2335 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2336                            : "%mm0", "%mm1"               // clobber list
2337                            , "%mm2", "%mm3"
2338 #endif
2339                         );
2340                      }
2341 
2342                      sptr -= width_mmx*BPP3;
2343                      dp -= width_mmx*2*BPP3;
2344                      for (i = width; i; i--)
2345                      {
2346                         png_byte v[8];
2347                         int j;
2348 
2349                         png_memcpy(v, sptr, BPP3);
2350                         for (j = 0; j < png_pass_inc[pass]; j++)
2351                         {
2352                            png_memcpy(dp, v, BPP3);
2353                            dp -= BPP3;
2354                         }
2355                         sptr -= BPP3;
2356                      }
2357                   }
2358                   else if (((pass == 2) || (pass == 3)) && width)
2359                   {
2360                      __asm__ __volatile__ (
2361                         "sub  $9, %2             \n\t"
2362                                      // (png_pass_inc[pass] - 1)*pixel_bytes
2363 
2364                      ".loop3_pass2:              \n\t"
2365                         "movd (%1), %%mm0        \n\t" // x x x x x 2 1 0
2366                         "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
2367                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
2368                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
2369                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
2370                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
2371                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
2372                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
2373                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
2374                         "movq %%mm0, 4(%2)       \n\t"
2375                         "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
2376                         "sub  $3, %1             \n\t"
2377                         "movd %%mm0, (%2)        \n\t"
2378                         "sub  $12, %2            \n\t"
2379                         "decl %%ecx              \n\t"
2380                         "jnz .loop3_pass2        \n\t"
2381                         "EMMS                    \n\t" // DONE
2382 
2383                         : "=c" (dummy_value_c),        // output regs (dummy)
2384                           "=S" (dummy_value_S),
2385                           "=D" (dummy_value_D),
2386                           "=a" (dummy_value_a)
2387 
2388                         : "0" (width),         // ecx  // input regs
2389                           "1" (sptr),          // esi/rsi
2390                           "2" (dp),            // edi/rdi
2391 #if defined(PNG_x86_64_USE_GOTPCREL)           // formerly _const4:
2392                           "3" (&_c64._amask5_3_0)  // (0x0000000000FFFFFFLL)
2393 #else
2394                           "3" (&_amask5_3_0)   // eax (0x0000000000FFFFFFLL)
2395 #endif
2396 
2397 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
2398                         : "%mm0", "%mm1", "%mm2"       // clobber list
2399 #endif
2400                      );
2401                   }
2402                   else if (width)  // && ((pass == 0) || (pass == 1))
2403                   {
2404                      __asm__ __volatile__ (
2405                         "sub  $21, %2            \n\t"
2406                                      // (png_pass_inc[pass] - 1)*pixel_bytes
2407 
2408                      ".loop3_pass0:              \n\t"
2409                         "movd (%1), %%mm0        \n\t" // x x x x x 2 1 0
2410                         "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
2411                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
2412                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
2413                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
2414                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
2415                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
2416                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
2417                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
2418                         "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
2419                         "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
2420                         "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
2421                         "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
2422                         "movq %%mm4, 16(%2)      \n\t"
2423                         "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
2424                         "movq %%mm3, 8(%2)       \n\t"
2425                         "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
2426                         "sub  $3, %1             \n\t"
2427                         "movq %%mm0, (%2)        \n\t"
2428                         "sub  $24, %2            \n\t"
2429                         "decl %%ecx              \n\t"
2430                         "jnz .loop3_pass0        \n\t"
2431                         "EMMS                    \n\t" // DONE
2432 
2433                         : "=c" (dummy_value_c),        // output regs (dummy)
2434                           "=S" (dummy_value_S),
2435                           "=D" (dummy_value_D),
2436                           "=a" (dummy_value_a)
2437 
2438                         : "0" (width),         // ecx  // input regs
2439                           "1" (sptr),          // esi/rsi
2440                           "2" (dp),            // edi/rdi
2441 #if defined(PNG_x86_64_USE_GOTPCREL)           // formerly _const4:
2442                           "3" (&_c64._amask5_3_0)  // (0x0000000000FFFFFFLL)
2443 #else
2444                           "3" (&_amask5_3_0)   // eax (0x0000000000FFFFFFLL)
2445 #endif
2446 
2447 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2448                         : "%mm0", "%mm1", "%mm2"       // clobber list
2449                         , "%mm3", "%mm4"
2450 #endif
2451                      );
2452                   }
2453                } /* end of pixel_bytes == 3 */
2454 
2455                //--------------------------------------------------------------
2456                else if (pixel_bytes == BPP4)
2457                {
2458                   if (((pass == 4) || (pass == 5)) && width)
2459                   {
2460                      int width_mmx = ((width >> 1) << 1) ;
2461                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2462                      if (width_mmx)
2463                      {
2464                         __asm__ __volatile__ (
2465                            "sub  $4, %1             \n\t"
2466                            "sub  $12, %2            \n\t"
2467 
2468                         ".loop4_pass4:              \n\t"
2469                            "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
2470                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2471                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2472                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2473                            "movq %%mm0, (%2)        \n\t"
2474                            "sub  $8, %1             \n\t"
2475                            "movq %%mm1, 8(%2)       \n\t"
2476                            "sub  $16, %2            \n\t"
2477                            "subl $2, %%ecx          \n\t"
2478                            "jnz .loop4_pass4        \n\t"
2479                            "EMMS                    \n\t" // DONE
2480 
2481                            : "=c" (dummy_value_c),        // output regs (dummy)
2482                              "=S" (dummy_value_S),
2483                              "=D" (dummy_value_D)
2484 
2485                            : "0" (width_mmx),     // ecx  // input regs
2486                              "1" (sptr),          // esi/rsi
2487                              "2" (dp)             // edi/rdi
2488 
2489 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
2490                            : "%mm0", "%mm1"               // clobber list
2491 #endif
2492                         );
2493                      }
2494 
2495                      sptr -= (width_mmx*BPP4 - BPP4); // sign fixed
2496                      dp -= (width_mmx*2*BPP4 - BPP4); // sign fixed
2497                      for (i = width; i; i--)
2498                      {
2499                         png_byte v[8];
2500                         int j;
2501                         sptr -= BPP4;
2502                         png_memcpy(v, sptr, BPP4);
2503                         for (j = 0; j < png_pass_inc[pass]; j++)
2504                         {
2505                            dp -= BPP4;
2506                            png_memcpy(dp, v, BPP4);
2507                         }
2508                      }
2509                   }
2510                   else if (((pass == 2) || (pass == 3)) && width)
2511                   {
2512                      int width_mmx = ((width >> 1) << 1);
2513                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2514                      if (width_mmx)
2515                      {
2516                         __asm__ __volatile__ (
2517                            "sub  $4, %1             \n\t"
2518                            "sub  $28, %2            \n\t"
2519 
2520                         ".loop4_pass2:              \n\t"
2521                            "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
2522                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2523                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2524                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2525                            "movq %%mm0, (%2)        \n\t"
2526                            "movq %%mm0, 8(%2)       \n\t"
2527                            "movq %%mm1, 16(%2)      \n\t"
2528                            "movq %%mm1, 24(%2)      \n\t"
2529                            "sub  $8, %1             \n\t"
2530                            "sub  $32, %2            \n\t"
2531                            "subl $2, %%ecx          \n\t"
2532                            "jnz .loop4_pass2        \n\t"
2533                            "EMMS                    \n\t" // DONE
2534 
2535                            : "=c" (dummy_value_c),        // output regs (dummy)
2536                              "=S" (dummy_value_S),
2537                              "=D" (dummy_value_D)
2538 
2539                            : "0" (width_mmx),     // ecx  // input regs
2540                              "1" (sptr),          // esi/rsi
2541                              "2" (dp)             // edi/rdi
2542 
2543 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
2544                            : "%mm0", "%mm1"               // clobber list
2545 #endif
2546                         );
2547                      }
2548 
2549                      sptr -= (width_mmx*4 - 4); // sign fixed
2550                      dp -= (width_mmx*16 - 4);  // sign fixed
2551                      for (i = width; i; i--)
2552                      {
2553                         png_byte v[8];
2554                         int j;
2555                         sptr -= 4;
2556                         png_memcpy(v, sptr, 4);
2557                         for (j = 0; j < png_pass_inc[pass]; j++)
2558                         {
2559                            dp -= 4;
2560                            png_memcpy(dp, v, 4);
2561                         }
2562                      }
2563                   }
2564                   else if (width)  // && ((pass == 0) || (pass == 1))
2565                   {
2566                      int width_mmx = ((width >> 1) << 1);
2567                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2568                      if (width_mmx)
2569                      {
2570                         __asm__ __volatile__ (
2571                            "sub  $4, %1             \n\t"
2572                            "sub  $60, %2            \n\t"
2573 
2574                         ".loop4_pass0:              \n\t"
2575                            "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
2576                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2577                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2578                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2579                            "movq %%mm0, (%2)        \n\t"
2580                            "movq %%mm0, 8(%2)       \n\t"
2581                            "movq %%mm0, 16(%2)      \n\t"
2582                            "movq %%mm0, 24(%2)      \n\t"
2583                            "movq %%mm1, 32(%2)      \n\t"
2584                            "movq %%mm1, 40(%2)      \n\t"
2585                            "movq %%mm1, 48(%2)      \n\t"
2586                            "sub  $8, %1             \n\t"
2587                            "movq %%mm1, 56(%2)      \n\t"
2588                            "sub  $64, %2            \n\t"
2589                            "subl $2, %%ecx          \n\t"
2590                            "jnz .loop4_pass0        \n\t"
2591                            "EMMS                    \n\t" // DONE
2592 
2593                            : "=c" (dummy_value_c),        // output regs (dummy)
2594                              "=S" (dummy_value_S),
2595                              "=D" (dummy_value_D)
2596 
2597                            : "0" (width_mmx),     // ecx  // input regs
2598                              "1" (sptr),          // esi/rsi
2599                              "2" (dp)             // edi/rdi
2600 
2601 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2602                            : "%mm0", "%mm1"               // clobber list
2603 #endif
2604                         );
2605                      }
2606 
2607                      sptr -= (width_mmx*4 - 4); // sign fixed
2608                      dp -= (width_mmx*32 - 4);  // sign fixed
2609                      for (i = width; i; i--)
2610                      {
2611                         png_byte v[8];
2612                         int j;
2613                         sptr -= 4;
2614                         png_memcpy(v, sptr, 4);
2615                         for (j = 0; j < png_pass_inc[pass]; j++)
2616                         {
2617                            dp -= 4;
2618                            png_memcpy(dp, v, 4);
2619                         }
2620                      }
2621                   }
2622                } /* end of pixel_bytes == 4 */
2623 
2624                //--------------------------------------------------------------
2625                else if (pixel_bytes == 1)
2626                {
2627                   if (((pass == 4) || (pass == 5)) && width)
2628                   {
2629                      int width_mmx = ((width >> 3) << 3);
2630                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
2631                      if (width_mmx)
2632                      {
2633                         __asm__ __volatile__ (
2634                            "sub  $7, %1             \n\t"
2635                            "sub  $15, %2            \n\t"
2636 
2637                         ".loop1_pass4:              \n\t"
2638                            "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
2639                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2640                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2641                            "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
2642                            "movq %%mm1, 8(%2)       \n\t"
2643                            "sub  $8, %1             \n\t"
2644                            "movq %%mm0, (%2)        \n\t"
2645                            "sub  $16, %2            \n\t"
2646                            "subl $8, %%ecx          \n\t"
2647                            "jnz .loop1_pass4        \n\t"
2648                            "EMMS                    \n\t" // DONE
2649 
2650                            : "=c" (dummy_value_c),        // output regs (dummy)
2651                              "=S" (dummy_value_S),
2652                              "=D" (dummy_value_D)
2653 
2654                            : "0" (width_mmx),     // ecx  // input regs
2655                              "1" (sptr),          // esi/rsi
2656                              "2" (dp)             // edi/rdi
2657 
2658 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2659                            : "%mm0", "%mm1"               // clobber list
2660 #endif
2661                         );
2662                      }
2663 
2664                      sptr -= width_mmx;
2665                      dp -= width_mmx*2;
2666                      for (i = width; i; i--)
2667                      {
2668                         int j;
2669 
2670                         for (j = 0; j < png_pass_inc[pass]; j++)
2671                         {
2672                            *dp-- = *sptr;
2673                         }
2674                         --sptr;
2675                      }
2676                   }
2677                   else if (((pass == 2) || (pass == 3)) && width)
2678                   {
2679                      int width_mmx = ((width >> 2) << 2);
2680                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
2681                      if (width_mmx)
2682                      {
2683                         __asm__ __volatile__ (
2684                            "sub  $3, %1             \n\t"
2685                            "sub  $15, %2            \n\t"
2686 
2687                         ".loop1_pass2:              \n\t"
2688                            "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
2689                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2690                            "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
2691                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
2692                            "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
2693                            "movq %%mm0, (%2)        \n\t"
2694                            "sub  $4, %1             \n\t"
2695                            "movq %%mm1, 8(%2)       \n\t"
2696                            "sub  $16, %2            \n\t"
2697                            "subl $4, %%ecx          \n\t"
2698                            "jnz .loop1_pass2        \n\t"
2699                            "EMMS                    \n\t" // DONE
2700 
2701                            : "=c" (dummy_value_c),        // output regs (dummy)
2702                              "=S" (dummy_value_S),
2703                              "=D" (dummy_value_D)
2704 
2705                            : "0" (width_mmx),     // ecx  // input regs
2706                              "1" (sptr),          // esi/rsi
2707                              "2" (dp)             // edi/rdi
2708 
2709 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2710                            : "%mm0", "%mm1"               // clobber list
2711 #endif
2712                         );
2713                      }
2714 
2715                      sptr -= width_mmx;
2716                      dp -= width_mmx*4;
2717                      for (i = width; i; i--)
2718                      {
2719                         int j;
2720 
2721                         for (j = 0; j < png_pass_inc[pass]; j++)
2722                         {
2723                            *dp-- = *sptr;
2724                         }
2725                         --sptr;
2726                      }
2727                   }
2728                   else if (width)  // && ((pass == 0) || (pass == 1))
2729                   {
2730                      int width_mmx = ((width >> 2) << 2);
2731                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
2732                      if (width_mmx)
2733                      {
2734                         __asm__ __volatile__ (
2735                            "sub  $3, %1             \n\t"
2736                            "sub  $31, %2            \n\t"
2737 
2738                         ".loop1_pass0:              \n\t"
2739                            "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
2740                            "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
2741                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2742                            "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
2743                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
2744                            "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
2745                            "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
2746                            "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
2747                            "movq %%mm0, (%2)        \n\t"
2748                            "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
2749                            "movq %%mm3, 8(%2)       \n\t"
2750                            "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
2751                            "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
2752                            "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
2753                            "movq %%mm2, 16(%2)      \n\t"
2754                            "sub  $4, %1             \n\t"
2755                            "movq %%mm4, 24(%2)      \n\t"
2756                            "sub  $32, %2            \n\t"
2757                            "subl $4, %%ecx          \n\t"
2758                            "jnz .loop1_pass0        \n\t"
2759                            "EMMS                    \n\t" // DONE
2760 
2761                            : "=c" (dummy_value_c),        // output regs (dummy)
2762                              "=S" (dummy_value_S),
2763                              "=D" (dummy_value_D)
2764 
2765                            : "0" (width_mmx),     // ecx  // input regs
2766                              "1" (sptr),          // esi/rsi
2767                              "2" (dp)             // edi/rdi
2768 
2769 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2770                            : "%mm0", "%mm1", "%mm2"       // clobber list
2771                            , "%mm3", "%mm4"
2772 #endif
2773                         );
2774                      }
2775 
2776                      sptr -= width_mmx;
2777                      dp -= width_mmx*8;
2778                      for (i = width; i; i--)
2779                      {
2780                         int j;
2781 
2782                        /* I simplified this part in version 1.0.4e
2783                         * here and in several other instances where
2784                         * pixel_bytes == 1  -- GR-P
2785                         *
2786                         * Original code:
2787                         *
2788                         * png_byte v[8];
2789                         * png_memcpy(v, sptr, pixel_bytes);
2790                         * for (j = 0; j < png_pass_inc[pass]; j++)
2791                         * {
2792                         *    png_memcpy(dp, v, pixel_bytes);
2793                         *    dp -= pixel_bytes;
2794                         * }
2795                         * sptr -= pixel_bytes;
2796                         *
2797                         * Replacement code is in the next three lines:
2798                         */
2799 
2800                         for (j = 0; j < png_pass_inc[pass]; j++)
2801                         {
2802                            *dp-- = *sptr;
2803                         }
2804                         --sptr;
2805                      }
2806                   }
2807                } /* end of pixel_bytes == 1 */
2808 
2809                //--------------------------------------------------------------
2810                else if (pixel_bytes == BPP2)
2811                {
2812                   if (((pass == 4) || (pass == 5)) && width)
2813                   {
2814                      int width_mmx = ((width >> 1) << 1) ;
2815                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2816                      if (width_mmx)
2817                      {
2818                         __asm__ __volatile__ (
2819                            "sub  $2, %1             \n\t"
2820                            "sub  $6, %2             \n\t"
2821 
2822                         ".loop2_pass4:              \n\t"
2823                            "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
2824                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2825                            "sub  $4, %1             \n\t"
2826                            "movq %%mm0, (%2)        \n\t"
2827                            "sub  $8, %2             \n\t"
2828                            "subl $2, %%ecx          \n\t"
2829                            "jnz .loop2_pass4        \n\t"
2830                            "EMMS                    \n\t" // DONE
2831 
2832                            : "=c" (dummy_value_c),        // output regs (dummy)
2833                              "=S" (dummy_value_S),
2834                              "=D" (dummy_value_D)
2835 
2836                            : "0" (width_mmx),     // ecx  // input regs
2837                              "1" (sptr),          // esi/rsi
2838                              "2" (dp)             // edi/rdi
2839 
2840 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2841                            : "%mm0"                       // clobber list
2842 #endif
2843                         );
2844                      }
2845 
2846                      sptr -= (width_mmx*BPP2 - BPP2); // sign fixed
2847                      dp -= (width_mmx*2*BPP2 - BPP2); // sign fixed
2848                      for (i = width; i; i--)
2849                      {
2850                         png_byte v[8];
2851                         int j;
2852                         sptr -= BPP2;
2853                         png_memcpy(v, sptr, BPP2);
2854                         for (j = 0; j < png_pass_inc[pass]; j++)
2855                         {
2856                            dp -= BPP2;
2857                            png_memcpy(dp, v, BPP2);
2858                         }
2859                      }
2860                   }
2861                   else if (((pass == 2) || (pass == 3)) && width)
2862                   {
2863                      int width_mmx = ((width >> 1) << 1) ;
2864                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2865                      if (width_mmx)
2866                      {
2867                         __asm__ __volatile__ (
2868                            "sub  $2, %1             \n\t"
2869                            "sub  $14, %2            \n\t"
2870 
2871                         ".loop2_pass2:              \n\t"
2872                            "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
2873                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2874                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2875                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2876                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2877                            "movq %%mm0, (%2)        \n\t"
2878                            "sub  $4, %1             \n\t"
2879                            "movq %%mm1, 8(%2)       \n\t"
2880                            "sub  $16, %2            \n\t"
2881                            "subl $2, %%ecx          \n\t"
2882                            "jnz .loop2_pass2        \n\t"
2883                            "EMMS                    \n\t" // DONE
2884 
2885                            : "=c" (dummy_value_c),        // output regs (dummy)
2886                              "=S" (dummy_value_S),
2887                              "=D" (dummy_value_D)
2888 
2889                            : "0" (width_mmx),     // ecx  // input regs
2890                              "1" (sptr),          // esi/rsi
2891                              "2" (dp)             // edi/rdi
2892 
2893 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
2894                            : "%mm0", "%mm1"               // clobber list
2895 #endif
2896                         );
2897                      }
2898 
2899                      sptr -= (width_mmx*2 - 2); // sign fixed
2900                      dp -= (width_mmx*8 - 2);   // sign fixed
2901                      for (i = width; i; i--)
2902                      {
2903                         png_byte v[8];
2904                         int j;
2905                         sptr -= 2;
2906                         png_memcpy(v, sptr, 2);
2907                         for (j = 0; j < png_pass_inc[pass]; j++)
2908                         {
2909                            dp -= 2;
2910                            png_memcpy(dp, v, 2);
2911                         }
2912                      }
2913                   }
2914                   else if (width)  // && ((pass == 0) || (pass == 1))
2915                   {
2916                      int width_mmx = ((width >> 1) << 1);
2917                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2918                      if (width_mmx)
2919                      {
2920                         __asm__ __volatile__ (
2921                            "sub  $2, %1             \n\t"
2922                            "sub  $30, %2            \n\t"
2923 
2924                         ".loop2_pass0:              \n\t"
2925                            "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
2926                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2927                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2928                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2929                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2930                            "movq %%mm0, (%2)        \n\t"
2931                            "movq %%mm0, 8(%2)       \n\t"
2932                            "movq %%mm1, 16(%2)      \n\t"
2933                            "sub  $4, %1             \n\t"
2934                            "movq %%mm1, 24(%2)      \n\t"
2935                            "sub  $32, %2            \n\t"
2936                            "subl $2, %%ecx          \n\t"
2937                            "jnz .loop2_pass0        \n\t"
2938                            "EMMS                    \n\t" // DONE
2939 
2940                            : "=c" (dummy_value_c),        // output regs (dummy)
2941                              "=S" (dummy_value_S),
2942                              "=D" (dummy_value_D)
2943 
2944                            : "0" (width_mmx),     // ecx  // input regs
2945                              "1" (sptr),          // esi/rsi
2946                              "2" (dp)             // edi/rdi
2947 
2948 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
2949                            : "%mm0", "%mm1"               // clobber list
2950 #endif
2951                         );
2952                      }
2953 
2954                      sptr -= (width_mmx*2 - 2); // sign fixed
2955                      dp -= (width_mmx*16 - 2);  // sign fixed
2956                      for (i = width; i; i--)
2957                      {
2958                         png_byte v[8];
2959                         int j;
2960                         sptr -= 2;
2961                         png_memcpy(v, sptr, 2);
2962                         for (j = 0; j < png_pass_inc[pass]; j++)
2963                         {
2964                            dp -= 2;
2965                            png_memcpy(dp, v, 2);
2966                         }
2967                      }
2968                   }
2969                } /* end of pixel_bytes == 2 */
2970 
2971                //--------------------------------------------------------------
2972                else if (pixel_bytes == BPP8)
2973                {
2974 // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
2975                   // GRR NOTE:  no need to combine passes here!
2976                   if (((pass == 4) || (pass == 5)) && width)
2977                   {
2978                      // source is 8-byte RRGGBBAA
2979                      // dest is 16-byte RRGGBBAA RRGGBBAA
2980                      __asm__ __volatile__ (
2981                         "sub  $8, %2             \n\t" // start of last block
2982 
2983                      ".loop8_pass4:              \n\t"
2984                         "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
2985                         "movq %%mm0, (%2)        \n\t"
2986                         "sub  $8, %1             \n\t"
2987                         "movq %%mm0, 8(%2)       \n\t"
2988                         "sub  $16, %2            \n\t"
2989                         "decl %%ecx              \n\t"
2990                         "jnz .loop8_pass4        \n\t"
2991                         "EMMS                    \n\t" // DONE
2992 
2993                         : "=c" (dummy_value_c),        // output regs (dummy)
2994                           "=S" (dummy_value_S),
2995                           "=D" (dummy_value_D)
2996 
2997                         : "0" (width),         // ecx  // input regs
2998                           "1" (sptr),          // esi/rsi
2999                           "2" (dp)             // edi/rdi
3000 
3001 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
3002                         : "%mm0"                       // clobber list
3003 #endif
3004                      );
3005                   }
3006                   else if (((pass == 2) || (pass == 3)) && width)
3007                   {
3008                      // source is 8-byte RRGGBBAA
3009                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
3010                      // (recall that expansion is _in place_:  sptr and dp
3011                      //  both point at locations within same row buffer)
3012                      __asm__ __volatile__ (
3013                         "sub  $24, %2            \n\t" // start of last block
3014 
3015                      ".loop8_pass2:              \n\t"
3016                         "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
3017                         "movq %%mm0, (%2)        \n\t"
3018                         "movq %%mm0, 8(%2)       \n\t"
3019                         "movq %%mm0, 16(%2)      \n\t"
3020                         "sub  $8, %1             \n\t"
3021                         "movq %%mm0, 24(%2)      \n\t"
3022                         "sub  $32, %2            \n\t"
3023                         "decl %%ecx              \n\t"
3024                         "jnz .loop8_pass2        \n\t"
3025                         "EMMS                    \n\t" // DONE
3026 
3027                         : "=c" (dummy_value_c),        // output regs (dummy)
3028                           "=S" (dummy_value_S),
3029                           "=D" (dummy_value_D)
3030 
3031                         : "0" (width),         // ecx  // input regs
3032                           "1" (sptr),          // esi/rsi
3033                           "2" (dp)             // edi/rdi
3034 
3035 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
3036                         : "%mm0"                       // clobber list
3037 #endif
3038                      );
3039                   }
3040                   else if (width)  // && ((pass == 0) || (pass == 1))
3041                   {
3042                      // source is 8-byte RRGGBBAA
3043                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
3044                      __asm__ __volatile__ (
3045                         "sub  $56, %2            \n\t" // start of last block
3046 
3047                      ".loop8_pass0:              \n\t"
3048                         "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
3049                         "movq %%mm0, (%2)        \n\t"
3050                         "movq %%mm0, 8(%2)       \n\t"
3051                         "movq %%mm0, 16(%2)      \n\t"
3052                         "movq %%mm0, 24(%2)      \n\t"
3053                         "movq %%mm0, 32(%2)      \n\t"
3054                         "movq %%mm0, 40(%2)      \n\t"
3055                         "movq %%mm0, 48(%2)      \n\t"
3056                         "sub  $8, %1             \n\t"
3057                         "movq %%mm0, 56(%2)      \n\t"
3058                         "sub  $64, %2            \n\t"
3059                         "decl %%ecx              \n\t"
3060                         "jnz .loop8_pass0        \n\t"
3061                         "EMMS                    \n\t" // DONE
3062 
3063                         : "=c" (dummy_value_c),        // output regs (dummy)
3064                           "=S" (dummy_value_S),
3065                           "=D" (dummy_value_D)
3066 
3067                         : "0" (width),         // ecx  // input regs
3068                           "1" (sptr),          // esi/rsi
3069                           "2" (dp)             // edi/rdi
3070 
3071 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
3072                         : "%mm0"                       // clobber list
3073 #endif
3074                      );
3075                   }
3076                } /* end of pixel_bytes == 8 */
3077 
3078                //--------------------------------------------------------------
3079                else if (pixel_bytes == BPP6)   // why no MMX for this case?
3080                {
3081                   for (i = width; i; i--)
3082                   {
3083                      png_byte v[8];
3084                      int j;
3085                      png_memcpy(v, sptr, BPP6);
3086                      for (j = 0; j < png_pass_inc[pass]; j++)
3087                      {
3088                         png_memcpy(dp, v, BPP6);
3089                         dp -= BPP6;
3090                      }
3091                      sptr -= BPP6;
3092                   }
3093                } /* end of pixel_bytes == 6 */
3094 
3095                //--------------------------------------------------------------
3096                else
3097                {
3098                   // ERROR:  SHOULD NEVER BE REACHED
3099 #if defined(PNG_DEBUG)
3100                   png_debug(1, "Internal libpng logic error (GCC "
3101                     "png_do_read_interlace() _mmx_supported)\n");
3102 #endif
3103                }
3104 
3105             } // end of _mmx_supported ========================================
3106 
3107             else /* MMX not supported:  use modified C code - takes advantage
3108                   *   of inlining of png_memcpy for a constant */
3109             {
3110                if (pixel_bytes == BPP3)
3111                {
3112                   for (i = width; i; i--)
3113                   {
3114                      png_byte v[8];
3115                      int j;
3116                      png_memcpy(v, sptr, BPP3);
3117                      for (j = 0; j < png_pass_inc[pass]; j++)
3118                      {
3119                         png_memcpy(dp, v, BPP3);
3120                         dp -= BPP3;
3121                      }
3122                      sptr -= BPP3;
3123                   }
3124                }
3125                else if (pixel_bytes == BPP4)
3126                {
3127                   for (i = width; i; i--)
3128                   {
3129                      png_byte v[8];
3130                      int j;
3131                      png_memcpy(v, sptr, BPP4);
3132                      for (j = 0; j < png_pass_inc[pass]; j++)
3133                      {
3134 #if defined(PNG_DEBUG) && defined(PNG_1_0_X)  // row_buf_size gone in 1.2.x
3135                         if (dp < row || dp+3 > row+png_ptr->row_buf_size)
3136                         {
3137                            printf("dp out of bounds: row=%10p, dp=%10p, "
3138                              "rp=%10p\n", row, dp, row+png_ptr->row_buf_size);
3139                            printf("row_buf_size=%lu\n", png_ptr->row_buf_size);
3140                         }
3141 #endif
3142                         png_memcpy(dp, v, BPP4);
3143                         dp -= BPP4;
3144                      }
3145                      sptr -= BPP4;
3146                   }
3147                }
3148                else if (pixel_bytes == 1)
3149                {
3150                   for (i = width; i; i--)
3151                   {
3152                      int j;
3153                      for (j = 0; j < png_pass_inc[pass]; j++)
3154                      {
3155                         *dp-- = *sptr;
3156                      }
3157                      --sptr;
3158                   }
3159                }
3160                else if (pixel_bytes == BPP2)
3161                {
3162                   for (i = width; i; i--)
3163                   {
3164                      png_byte v[8];
3165                      int j;
3166                      png_memcpy(v, sptr, BPP2);
3167                      for (j = 0; j < png_pass_inc[pass]; j++)
3168                      {
3169                         png_memcpy(dp, v, BPP2);
3170                         dp -= BPP2;
3171                      }
3172                      sptr -= BPP2;
3173                   }
3174                }
3175                else if (pixel_bytes == BPP6)
3176                {
3177                   for (i = width; i; i--)
3178                   {
3179                      png_byte v[8];
3180                      int j;
3181                      png_memcpy(v, sptr, BPP6);
3182                      for (j = 0; j < png_pass_inc[pass]; j++)
3183                      {
3184                         png_memcpy(dp, v, BPP6);
3185                         dp -= BPP6;
3186                      }
3187                      sptr -= BPP6;
3188                   }
3189                }
3190                else if (pixel_bytes == BPP8)
3191                {
3192                   for (i = width; i; i--)
3193                   {
3194                      png_byte v[8];
3195                      int j;
3196                      png_memcpy(v, sptr, BPP8);
3197                      for (j = 0; j < png_pass_inc[pass]; j++)
3198                      {
3199                         png_memcpy(dp, v, BPP8);
3200                         dp -= BPP8;
3201                      }
3202                      sptr -= BPP8;
3203                   }
3204                }
3205                else
3206                {
3207                   // ERROR:  SHOULD NEVER BE REACHED
3208 #if defined(PNG_DEBUG)
3209                   png_debug(1, "Internal libpng logic error (GCC "
3210                     "png_do_read_interlace() !_mmx_supported)\n");
3211 #endif
3212                }
3213 
3214             } /* end if (MMX not supported) */
3215             break;
3216          } /* end default (8-bit or larger) */
3217       } /* end switch (row_info->pixel_depth) */
3218 
3219       row_info->width = final_width;
3220 
3221       row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
3222    }
3223 
3224 } /* end png_do_read_interlace() */
3225 
3226 #endif /* PNG_HAVE_MMX_READ_INTERLACE */
3227 #endif /* PNG_READ_INTERLACING_SUPPORTED */
3228 
3229 
3230 
3231 #if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
3232 #if defined(PNG_MMX_READ_FILTER_AVG_SUPPORTED)
3233 
3234 //===========================================================================//
3235 //                                                                           //
3236 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
3237 //                                                                           //
3238 //===========================================================================//
3239 
3240 // Optimized code for PNG Average filter decoder
3241 
3242 static void /* PRIVATE */
3243 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
3244                             png_bytep prev_row)
3245 {
3246    unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
3247    int bpp;
3248    int dummy_value_a;
3249    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
3250    int dummy_value_d;
3251    png_bytep dummy_value_S;
3252    png_bytep dummy_value_D;
3253    int diff; //     __attribute__((used));
3254 
3255    bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
3256    FullLength = row_info->rowbytes;         // number of bytes to filter
3257 
3258    __asm__ __volatile__ (
3259    "avg_top:                       \n\t"
3260       SAVE_GOT_ebx
3261       SAVE_r15
3262       SAVE_ebp
3263       // initialize address pointers and offset
3264 //pre "movl row, %5                \n\t" // edi/rdi:  ptr to Avg(x)
3265       "xorl %%ebx, %%ebx           \n\t" // ebx:  x
3266 //pre "movl prev_row, %4           \n\t" // esi/rsi:  ptr to Prior(x)
3267       "mov  %5, " PDX "            \n\t" // copy of row ptr...
3268 //pre "subl bpp, " PDX "           \n\t" // (bpp is preloaded into ecx)
3269       "sub  " PCX "," PDX "        \n\t" // edx/rdx:  ptr to Raw(x-bpp)
3270 //pre "movl FullLength, %%eax      \n\t" // bring in via eax...
3271       SAVE_FullLength                    // ...but store for later use
3272       "xorl %%eax, %%eax           \n\t"
3273 
3274       // Compute the Raw value for the first bpp bytes
3275       //    Raw(x) = Avg(x) + (Prior(x)/2)
3276    "avg_rlp:                       \n\t"
3277       "movb (%4," PBX ",), %%al    \n\t" // load al with Prior(x)
3278       "incl %%ebx                  \n\t"
3279       "shrb %%al                   \n\t" // divide by 2
3280       "addb -1(%5," PBX ",), %%al  \n\t" // add Avg(x); -1 to offset inc ebx
3281 //pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
3282       "cmpl %%ecx, %%ebx           \n\t"
3283       "movb %%al, -1(%5," PBX ",)  \n\t" // write Raw(x); -1 to offset inc ebx
3284       "jb avg_rlp                  \n\t" // mov does not affect flags
3285 
3286       // get # of bytes to alignment (32-bit mask _would_ be good enough
3287       // [computing delta], but 32-bit ops are zero-extended on 64-bit, argh)
3288       // (if swapped edx and ebp, could do 8-bit or 16-bit mask...FIXME?)
3289       "mov  %5, " PBP "            \n\t" // take start of row
3290       "add  " PBX "," PBP "        \n\t" // add bpp
3291       "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
3292 //    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
3293       CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
3294       "sub  %5, " PBP "            \n\t" // subtract row ptr again => ebp =
3295       "jz avg_go                   \n\t" //  target value of ebx at alignment
3296 
3297       "xorl %%ecx, %%ecx           \n\t"
3298 
3299       // fix alignment
3300       // Compute the Raw value for the bytes up to the alignment boundary
3301       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3302    "avg_lp1:                       \n\t"
3303       "xorl %%eax, %%eax           \n\t"
3304       "movb (%4," PBX ",), %%cl    \n\t" // load cl with Prior(x)
3305       "movb (" PDX "," PBX ",), %%al \n\t" // load al with Raw(x-bpp)
3306       "addw %%cx, %%ax             \n\t"
3307       "incl %%ebx                  \n\t"
3308       "shrw %%ax                   \n\t" // divide by 2
3309       "addb -1(%5," PBX ",), %%al  \n\t" // add Avg(x); -1 to offset inc ebx
3310       "cmpl %%ebp, %%ebx           \n\t" // check if at alignment boundary
3311       "movb %%al, -1(%5," PBX ",)  \n\t" // write Raw(x); -1 to offset inc ebx
3312       "jb avg_lp1                  \n\t" // repeat until at alignment boundary
3313 
3314    "avg_go:                        \n\t"
3315       RESTORE_FullLength "%%eax    \n\t" // FullLength -> eax
3316       "movl %%eax, %%ecx           \n\t" // copy -> ecx
3317       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
3318       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
3319       "subl %%eax, %%ecx           \n\t" // sub over-bytes from original length
3320 //out "movl %%ecx, MMXLength       \n\t"
3321       "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
3322       RESTORE_ebp                        //  (could swap ebp and edx functions)
3323       RESTORE_r15
3324       RESTORE_GOT_ebx
3325 
3326 // "There is no way for you to specify that an input operand is modified
3327 // without also specifying it as an output operand."  [makes sense]
3328 
3329 // "Unless an output operand has the `&' constraint modifier, GCC may
3330 // allocate it in the same register as an unrelated input operand, on the
3331 // assumption the inputs are consumed before the outputs are produced."
3332 // [trying to _force_ this]
3333 
3334 // "`='   Means that this operand is write-only for this instruction:
3335 //        the previous value is discarded and replaced by output data."
3336 //        [operand == variable name, presumably]
3337 
3338       // output regs
3339       // these are operands 0-1 (originally 0-3):
3340       : "=c" (MMXLength),      // %0 -> %0
3341         "=a" (diff)            // %3 -> %1
3342 //      "=S" (dummy_value_S),  // %1 -> GONE
3343 //      "=D" (dummy_value_D),  // %2 -> GONE
3344 
3345       // input regs
3346       // these are operands 2-5 (originally 4-7); two of their constraints say
3347       // they must go in same places as operands 0-1 (originally 0-3) above:
3348       : "0" (bpp),         // %4 -> %2 ecx
3349         "1" (FullLength),  // %7 -> %3 eax
3350         "S" (prev_row),    // %5 -> %4 esi/rsi
3351         "D" (row)          // %6 -> %5 edi/rdi
3352 
3353       : "%edx"                           // clobber list
3354         _CLOBBER_r15
3355         _CLOBBER_ebp
3356         _CLOBBER_GOT_ebx
3357    );
3358 
3359    // now do the math for the rest of the row
3360    switch (bpp)
3361    {
3362       case 3:
3363       {
3364 //       _ShiftBpp = 24;    // == 3 * 8
3365 //       _ShiftRem = 40;    // == 64 - 24
3366 
3367          __asm__ __volatile__ (
3368             // re-init address pointers and offset
3369             LOAD_GOT_rbp
3370             "movq " AMASK5_3_0 ", %%mm7    \n\t" // _amask5_3_0 -> mm7
3371 // preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
3372                                                  //  alignment boundary
3373             "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
3374 // preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
3375             "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3376 // preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
3377             RESTORE_rbp
3378 
3379             // prime the pump:  load the first Raw(x-bpp) data set
3380             "movq  -8(%1," PCX ",), %%mm2 \n\t"// load previous aligned 8 bytes
3381                                                // (correct pos. in loop below)
3382          "avg_3lp:                        \n\t"
3383             "movq  (%1," PCX ",), %%mm0   \n\t" // load mm0 with Avg(x)
3384             "movq  %%mm5, %%mm3           \n\t"
3385             "psrlq $40, %%mm2             \n\t" // correct position Raw(x-bpp)
3386                                                 // data
3387             "movq  (%0," PCX ",), %%mm1   \n\t" // load mm1 with Prior(x)
3388             "movq  %%mm7, %%mm6           \n\t"
3389             "pand  %%mm1, %%mm3           \n\t" // get lsb for each prevrow byte
3390             "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
3391             "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
3392                                                 // byte
3393             "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
3394                                                 // each byte
3395             // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
3396             "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
3397                                                 // LBCarrys
3398             "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3399                                                 // where both lsb's were == 1
3400                                                 // (valid only for active group)
3401             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
3402             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
3403                                                 // byte
3404             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
3405                                                 // for each byte
3406             "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 1
3407                                                 // bytes to add to Avg
3408             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
3409                                                 // Avg for each Active byte
3410             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3411             "psllq $24, %%mm6             \n\t" // shift the mm6 mask to cover
3412                                                 // bytes 3-5
3413             "movq  %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3414             "psllq $24, %%mm2             \n\t" // shift data to pos. correctly
3415             "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
3416                                                 // LBCarrys
3417             "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3418                                                 // where both lsb's were == 1
3419                                                 // (valid only for active group)
3420             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
3421             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
3422                                                 // byte
3423             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
3424                                                 // for each byte
3425             "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 2
3426                                                 // bytes to add to Avg
3427             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
3428                                                 // Avg for each Active byte
3429 
3430             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3431             "psllq $24, %%mm6             \n\t" // shift mm6 mask to cover last
3432                                                 // two bytes
3433             "movq  %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3434             "psllq $24, %%mm2             \n\t" // shift data to pos. correctly
3435                               // Data need be shifted only once here to
3436                               // get the correct x-bpp offset.
3437             "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
3438                                                 // LBCarrys
3439             "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3440                                                 // where both
3441                               // lsb's were == 1 (only valid for active group)
3442             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
3443             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
3444                                                 // byte
3445             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
3446                                                 // for each byte
3447             "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 3
3448                                                 // bytes to add to Avg
3449             "addl  $8, %%ecx              \n\t"
3450             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
3451                                                 // Avg for each Active byte
3452             // now ready to write back to memory
3453             "movq  %%mm0, -8(%1," PCX ",) \n\t"
3454             // move updated Raw(x) to use as Raw(x-bpp) for next loop
3455             "cmpl  %%eax, %%ecx           \n\t" // MMXLength
3456             "movq  %%mm0, %%mm2           \n\t" // mov updated Raw(x) to mm2
3457             "jb avg_3lp                   \n\t"
3458 
3459             : "=S" (dummy_value_S),            // output regs (dummy)
3460               "=D" (dummy_value_D),
3461               "=c" (dummy_value_c),
3462               "=a" (dummy_value_a)
3463 
3464             : "0" (prev_row),    // esi/rsi    // input regs
3465               "1" (row),         // edi/rdi
3466               "2" (diff),        // ecx
3467               "3" (MMXLength)    // eax
3468 
3469 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
3470             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
3471             , "%mm4", "%mm5", "%mm6", "%mm7"
3472 #endif
3473          );
3474       }
3475       break;  // end 3 bpp
3476 
3477       case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
3478       {         // but loop uses all 8 MMX regs, and psrlq/psllq require 64-bit
3479                 // mem (PIC/.so problems), MMX reg (none left), or immediate
3480 //       _ShiftBpp = bpp << 3;        // 32 (psllq)
3481 //       _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)
3482 
3483          __asm__ __volatile__ (
3484             LOAD_GOT_rbp
3485             "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3486             "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
3487             // re-init address pointers and offset
3488 // preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
3489                                                  // alignment boundary
3490             "movq " AMASK0_8_0 ", %%mm7    \n\t" // _amask0_8_0 -> mm7
3491             RESTORE_rbp
3492 
3493             // ... and clear all bytes except for 1st active group
3494 // preload  "movl  row, %1               \n\t" // edi:  Avg(x)
3495             "psrlq $32, %%mm7            \n\t" // was _ShiftRem
3496 // preload  "movl  prev_row, %0          \n\t" // esi:  Prior(x)
3497             "movq  %%mm7, %%mm6          \n\t"
3498             "psllq $32, %%mm6            \n\t" // mask for 2nd active group
3499 
3500             // prime the pump:  load the first Raw(x-bpp) data set
3501             "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
3502                                              // (we correct pos. in loop below)
3503          "avg_4lp:                       \n\t"
3504             "movq (%1," PCX ",), %%mm0   \n\t"
3505             "psrlq $32, %%mm2            \n\t" // shift data to pos. correctly
3506             "movq (%0," PCX ",), %%mm1   \n\t"
3507             // add (Prev_row/2) to average
3508             "movq %%mm5, %%mm3           \n\t"
3509             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3510             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3511             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3512                                                // byte
3513             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3514                                                // each byte
3515             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3516             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3517                                                // LBCarrys
3518             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3519                                                // where both
3520                               // lsb's were == 1 (only valid for active group)
3521             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3522             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3523                                                // byte
3524             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3525                                                // for each byte
3526             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
3527                                                // bytes to add to Avg
3528             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3529                                                // for each Active
3530                               // byte
3531             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3532             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3533             "psllq $32, %%mm2            \n\t" // shift data to pos. correctly
3534             "addl $8, %%ecx              \n\t"
3535             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3536                                                // LBCarrys
3537             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3538                                                // where both
3539                               // lsb's were == 1 (only valid for active group)
3540             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3541             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3542                                                // byte
3543             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3544                                                // for each byte
3545             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3546                                                // bytes to add to Avg
3547             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3548                                                // Avg for each Active byte
3549             "cmpl %%eax, %%ecx           \n\t" // MMXLength
3550             // now ready to write back to memory
3551             "movq %%mm0, -8(%1," PCX ",) \n\t"
3552             // prep Raw(x-bpp) for next loop
3553             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3554             "jb avg_4lp                  \n\t"
3555 
3556             : "=S" (dummy_value_S),            // output regs (dummy)
3557               "=D" (dummy_value_D),
3558               "=c" (dummy_value_c),
3559               "=a" (dummy_value_a)
3560 
3561             : "0" (prev_row),    // esi/rsi    // input regs
3562               "1" (row),         // edi/rdi
3563               "2" (diff),        // ecx
3564               "3" (MMXLength)    // eax
3565 
3566 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
3567             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
3568             , "%mm4", "%mm5", "%mm6", "%mm7"
3569 #endif
3570          );
3571       }
3572       break;  // end 4 bpp
3573 
3574       case 1:
3575       {
3576          __asm__ __volatile__ (
3577             // re-init address pointers and offset
3578 // preload  "movl diff, %%ecx            \n\t" // ecx: x = offset to align. bdry
3579 // preload  "movl row, %1                \n\t" // edi/rdi:  Avg(x)
3580 // preload  "movl FullLength, %%eax      \n\t"
3581             "cmpl %%eax, %%ecx           \n\t" // test if offset at end of array
3582             "jnb avg_1end                \n\t"
3583 
3584             SAVE_ebp
3585 
3586             // do Avg decode for remaining bytes
3587 // preload  "movl prev_row, %0           \n\t" // esi/rsi:  Prior(x)
3588             "mov  %1, " PBP "            \n\t" // copy of row pointer...
3589             "dec  " PBP "                \n\t" // ebp/rbp:  Raw(x-bpp)
3590             "xorl %%edx, %%edx           \n\t" // zero edx before using dl & dx
3591                                                //  in loop below
3592             SAVE_GOT_ebx
3593 
3594          "avg_1lp:                       \n\t"
3595             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3596             "xorl %%ebx, %%ebx           \n\t"
3597             "movb (%0," PCX ",), %%dl    \n\t" // load dl with Prior(x)
3598             "movb (" PBP "," PCX ",), %%bl \n\t" // load bl with Raw(x-bpp)
3599             "addw %%dx, %%bx             \n\t"
3600             "incl %%ecx                  \n\t"
3601             "shrw %%bx                   \n\t" // divide by 2
3602             "addb -1(%1," PCX ",), %%bl  \n\t" // add Avg(x); -1 to offset
3603                                                // inc ecx
3604             "cmpl %%eax, %%ecx           \n\t" // check if at end of array
3605             "movb %%bl, -1(%1," PCX ",)  \n\t" // write back Raw(x);
3606                          // mov does not affect flags; -1 to offset inc ecx
3607             "jb avg_1lp                  \n\t"
3608 
3609             RESTORE_GOT_ebx
3610             RESTORE_ebp
3611 
3612          "avg_1end:                      \n\t"
3613 
3614             : "=S" (dummy_value_S),            // output regs (dummy)
3615               "=D" (dummy_value_D),
3616               "=c" (dummy_value_c),
3617               "=a" (dummy_value_a)
3618 
3619             : "0" (prev_row),    // esi/rsi    // input regs
3620               "1" (row),         // edi/rdi
3621               "2" (diff),        // ecx
3622               "3" (FullLength)   // eax
3623 
3624             : "%edx"                           // clobber list
3625               _CLOBBER_GOT_ebx
3626               _CLOBBER_ebp
3627          );
3628       }
3629       return;  // end 1 bpp
3630 
3631       case 2:
3632       {
3633 //       _ShiftBpp = 16;   // == 2 * 8
3634 //       _ShiftRem = 48;   // == 64 - _ShiftBpp
3635 
3636          __asm__ __volatile__ (
3637             LOAD_GOT_rbp
3638             // load (former) _ActiveMask
3639             "movq " AMASK6_2_0 ", %%mm7    \n\t" // _amask6_2_0 -> mm7
3640             // re-init address pointers and offset
3641 // preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
3642                                                  // alignment boundary
3643             "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
3644 // preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
3645             "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3646 // preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
3647             RESTORE_rbp
3648 
3649             // prime the pump:  load the first Raw(x-bpp) data set
3650             "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
3651                                              // (we correct pos. in loop below)
3652          "avg_2lp:                       \n\t"
3653             "movq (%1," PCX ",), %%mm0   \n\t"
3654             "psrlq $48, %%mm2            \n\t" // shift data to pos. correctly
3655             "movq (%0," PCX ",), %%mm1   \n\t" //  (GRR BUGFIX:  was psllq)
3656             // add (Prev_row/2) to average
3657             "movq %%mm5, %%mm3           \n\t"
3658             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3659             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3660             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3661                                                // byte
3662             "movq %%mm7, %%mm6           \n\t"
3663             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3664                                                // each byte
3665 
3666             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3667             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3668                                                // LBCarrys
3669             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3670                                                // where both
3671                                                // lsb's were == 1 (only valid
3672                                                // for active group)
3673             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3674             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3675                                                // byte
3676             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3677                                                // for each byte
3678             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
3679                                                // bytes to add to Avg
3680             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3681                                                // for each Active byte
3682 
3683             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3684             "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
3685                                                // bytes 2 & 3
3686             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3687             "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
3688             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3689                                                // LBCarrys
3690             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3691                                                // where both
3692                                                // lsb's were == 1 (only valid
3693                                                // for active group)
3694             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3695             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3696                                                // byte
3697             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3698                                                // for each byte
3699             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3700                                                // bytes to add to Avg
3701             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3702                                                // Avg for each Active byte
3703 
3704             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3705             "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
3706                                                // bytes 4 & 5
3707             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3708             "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
3709             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3710                                                // LBCarrys
3711             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3712                                                // where both lsb's were == 1
3713                                                // (only valid for active group)
3714             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3715             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3716                                                // byte
3717             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3718                                                // for each byte
3719             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 3
3720                                                // bytes to add to Avg
3721             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3722                                                // Avg for each Active byte
3723 
3724             // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3725             "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
3726                                                // bytes 6 & 7
3727             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3728             "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
3729             "addl $8, %%ecx              \n\t"
3730             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3731                                                // LBCarrys
3732             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3733                                                // where both
3734                                                // lsb's were == 1 (only valid
3735                                                // for active group)
3736             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3737             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3738                                                // byte
3739             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3740                                                // for each byte
3741             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 4
3742                                                // bytes to add to Avg
3743             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3744                                                // Avg for each Active byte
3745             "cmpl %%eax, %%ecx           \n\t" // MMXLength
3746             // now ready to write back to memory
3747             "movq %%mm0, -8(%1," PCX ",) \n\t"
3748             // prep Raw(x-bpp) for next loop
3749             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3750             "jb avg_2lp                  \n\t"
3751 
3752             : "=S" (dummy_value_S),            // output regs (dummy)
3753               "=D" (dummy_value_D),
3754               "=c" (dummy_value_c),
3755               "=a" (dummy_value_a)
3756 
3757             : "0" (prev_row),    // esi/rsi    // input regs
3758               "1" (row),         // edi/rdi
3759               "2" (diff),        // ecx
3760               "3" (MMXLength)    // eax
3761 
3762 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
3763             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
3764             , "%mm4", "%mm5", "%mm6", "%mm7"
3765 #endif
3766          );
3767       }
3768       break;  // end 2 bpp
3769 
3770       case 6:   // formerly shared with 4 bpp case (see comments there)
3771       {
3772 //       _ShiftBpp = bpp << 3;        // 48 (psllq)
3773 //       _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)
3774 
3775          __asm__ __volatile__ (
3776             LOAD_GOT_rbp
3777             "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3778             "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
3779             // re-init address pointers and offset
3780 // preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
3781                                                  // alignment boundary
3782             "movq " AMASK0_8_0 ", %%mm7    \n\t" // _amask0_8_0 -> mm7
3783             RESTORE_rbp
3784 
3785             // ... and clear all bytes except for 1st active group
3786 // preload  "movl  row, %1               \n\t" // edi:  Avg(x)
3787             "psrlq $16, %%mm7            \n\t"
3788 // preload  "movl  prev_row, %0          \n\t" // esi:  Prior(x)
3789             "movq  %%mm7, %%mm6          \n\t"
3790             "psllq $48, %%mm6            \n\t" // mask for 2nd active group
3791 
3792             // prime the pump:  load the first Raw(x-bpp) data set
3793             "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
3794                                              // (we correct pos. in loop below)
3795          "avg_6lp:                       \n\t"
3796             "movq (%1," PCX ",), %%mm0   \n\t"
3797             "psrlq $16, %%mm2            \n\t" // shift data to pos. correctly
3798             "movq (%0," PCX ",), %%mm1   \n\t"
3799             // add (Prev_row/2) to average
3800             "movq %%mm5, %%mm3           \n\t"
3801             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3802             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3803             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3804                                                // byte
3805             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3806                                                // each byte
3807             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3808             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3809                                                // LBCarrys
3810             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3811                                                // where both
3812                               // lsb's were == 1 (only valid for active group)
3813             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3814             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3815                                                // byte
3816             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3817                                                // for each byte
3818             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
3819                                                // bytes to add to Avg
3820             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3821                                                // for each Active
3822                               // byte
3823             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3824             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3825             "psllq $48, %%mm2            \n\t" // shift data to pos. correctly
3826             "addl $8, %%ecx              \n\t"
3827             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3828                                                // LBCarrys
3829             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3830                                                // where both
3831                               // lsb's were == 1 (only valid for active group)
3832             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3833             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3834                                                // byte
3835             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3836                                                // for each byte
3837             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3838                                                // bytes to add to Avg
3839             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3840                                                // Avg for each Active byte
3841             "cmpl %%eax, %%ecx           \n\t" // MMXLength
3842             // now ready to write back to memory
3843             "movq %%mm0, -8(%1," PCX ",) \n\t"
3844             // prep Raw(x-bpp) for next loop
3845             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3846             "jb avg_6lp                  \n\t"
3847 
3848             : "=S" (dummy_value_S),            // output regs (dummy)
3849               "=D" (dummy_value_D),
3850               "=c" (dummy_value_c),
3851               "=a" (dummy_value_a)
3852 
3853             : "0" (prev_row),    // esi/rsi    // input regs
3854               "1" (row),         // edi/rdi
3855               "2" (diff),        // ecx
3856               "3" (MMXLength)    // eax
3857 
3858 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
3859             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
3860             , "%mm4", "%mm5", "%mm6", "%mm7"
3861 #endif
3862          );
3863       }
3864       break;  // end 6 bpp
3865 
3866       case 8:
3867       {
3868          __asm__ __volatile__ (
3869             // re-init address pointers and offset
3870 // preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
3871                                                  // alignment boundary
3872             LOAD_GOT_rbp
3873             "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
3874 // preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
3875             "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3876 // preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
3877             RESTORE_rbp
3878 
3879             // prime the pump:  load the first Raw(x-bpp) data set
3880             "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
3881                                       // (NO NEED to correct pos. in loop below)
3882 
3883          "avg_8lp:                       \n\t"
3884             "movq (%1," PCX ",), %%mm0   \n\t"
3885             "movq %%mm5, %%mm3           \n\t"
3886             "movq (%0," PCX ",), %%mm1   \n\t"
3887             "addl $8, %%ecx              \n\t"
3888             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3889             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3890             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3891                                                //  where both lsb's were == 1
3892             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3893             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
3894             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
3895             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
3896             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
3897             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3898             "cmpl %%eax, %%ecx           \n\t" // MMXLength
3899             "movq %%mm0, -8(%1," PCX ",) \n\t"
3900             "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
3901             "jb avg_8lp                  \n\t"
3902 
3903             : "=S" (dummy_value_S),            // output regs (dummy)
3904               "=D" (dummy_value_D),
3905               "=c" (dummy_value_c),
3906               "=a" (dummy_value_a)
3907 
3908             : "0" (prev_row),    // esi/rsi    // input regs
3909               "1" (row),         // edi/rdi
3910               "2" (diff),        // ecx
3911               "3" (MMXLength)    // eax
3912 
3913 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
3914             : "%mm0", "%mm1", "%mm2"           // clobber list
3915             , "%mm3", "%mm4", "%mm5"
3916 #endif
3917          );
3918       }
3919       break;  // end 8 bpp
3920 
3921       default:                // bpp != 1,2,3,4,6,8:  doesn't exist
3922       {
3923          // ERROR:  SHOULD NEVER BE REACHED
3924 #if defined(PNG_DEBUG)
3925          png_debug(1, "Internal libpng logic error (GCC "
3926            "png_read_filter_row_mmx_avg())\n");
3927 #endif
3928       }
3929       break;
3930 
3931    } // end switch (bpp)
3932 
3933    __asm__ __volatile__ (
3934       // MMX acceleration complete; now do clean-up
3935       // check if any remaining bytes left to decode
3936 //pre "movl FullLength, %%edx      \n\t"
3937 //pre "movl MMXLength, %%eax       \n\t" // eax:  x == offset bytes after MMX
3938 //pre "movl row, %2                \n\t" // edi:  Avg(x)
3939       "cmpl %%edx, %%eax           \n\t" // test if offset at end of array
3940       "jnb avg_end                 \n\t"
3941 
3942       SAVE_ebp
3943 
3944       // do Avg decode for remaining bytes
3945 //pre "movl prev_row, %1           \n\t" // esi:  Prior(x)
3946       "mov  %2, " PBP "            \n\t" // copy of row pointer...
3947 //pre "subl bpp, " PBP "           \n\t" // (bpp is preloaded into ecx)
3948       "sub  " PCX "," PBP "        \n\t" // ebp:  Raw(x-bpp)
3949       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
3950 
3951       SAVE_GOT_ebx
3952 
3953    "avg_lp2:                       \n\t"
3954       // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3955       "xorl %%ebx, %%ebx           \n\t"
3956       "movb (%1," PAX ",), %%cl    \n\t" // load cl with Prior(x)
3957       "movb (" PBP "," PAX ",), %%bl \n\t" // load bl with Raw(x-bpp)
3958       "addw %%cx, %%bx             \n\t"
3959       "incl %%eax                  \n\t"
3960       "shrw %%bx                   \n\t" // divide by 2
3961       "addb -1(%2," PAX ",), %%bl  \n\t" // add Avg(x); -1 to offset inc eax
3962       "cmpl %%edx, %%eax           \n\t" // check if at end of array
3963       "movb %%bl, -1(%2," PAX ",)  \n\t" // write back Raw(x) [mov does not
3964       "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc eax]
3965 
3966       RESTORE_GOT_ebx
3967       RESTORE_ebp
3968 
3969    "avg_end:                       \n\t"
3970       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
3971 
3972       : "=c" (dummy_value_c),            // output regs (dummy)
3973         "=S" (dummy_value_S),
3974         "=D" (dummy_value_D),
3975         "=a" (dummy_value_a),
3976         "=d" (dummy_value_d)
3977 
3978       : "0" (bpp),         // ecx        // input regs
3979         "1" (prev_row),    // esi/rsi
3980         "2" (row),         // edi/rdi
3981         "3" (MMXLength),   // eax
3982         "4" (FullLength)   // edx
3983 
3984       CLOB_COLON_ebx_ebp                 // clobber list
3985         CLOBBER_GOT_ebx
3986         CLOB_COMMA_ebx_ebp
3987         CLOBBER_ebp
3988    );
3989 
3990 } /* end png_read_filter_row_mmx_avg() */
3991 
3992 #endif /* PNG_MMX_READ_FILTER_AVG_SUPPORTED */
3993 
3994 
3995 
3996 #if defined(PNG_MMX_READ_FILTER_PAETH_SUPPORTED)
3997 #if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
3998 
3999 //===========================================================================//
4000 //                                                                           //
4001 //         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
4002 //                                                                           //
4003 //===========================================================================//
4004 
4005 // Optimized code for PNG Paeth filter decoder
4006 
4007 static void /* PRIVATE */
png_read_filter_row_mmx_paeth(png_row_infop row_info,png_bytep row,png_bytep prev_row)4008 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
4009                               png_bytep prev_row)
4010 {
4011    unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
4012    int bpp;
4013    int dummy_value_a;
4014    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
4015    int dummy_value_d;
4016    png_charp dummy_value_S;
4017    png_charp dummy_value_D;
4018    int diff; //     __attribute__((used));
4019 
4020    bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
4021    FullLength = row_info->rowbytes;         // number of bytes to filter
4022 
4023    __asm__ __volatile__ (
4024       SAVE_GOT_ebx
4025       SAVE_r15
4026       SAVE_ebp
4027 //pre "movl row, %2                \n\t" // edi/rdi
4028       "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
4029 //pre "movl prev_row, %1           \n\t" // esi/rsi
4030       "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
4031 //pre "movl FullLength, %%eax      \n\t" // bring in via eax...
4032       SAVE_FullLength                    // ...but store for later use
4033       "xorl %%eax, %%eax           \n\t"
4034 
4035       // Compute the Raw value for the first bpp bytes
4036       // Note: the formula works out to be always
4037       //   Raw(x) = Paeth(x) + Prior(x)      where x < bpp
4038    "paeth_rlp:                     \n\t"
4039       "movb (%2," PBX ",), %%al    \n\t"
4040       "addb (%1," PBX ",), %%al    \n\t"
4041       "incl %%ebx                  \n\t"
4042 //pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
4043       "cmpl %%ecx, %%ebx           \n\t"
4044       "movb %%al, -1(%2," PBX ",)  \n\t"
4045       "jb paeth_rlp                \n\t"
4046 
4047       // get # of bytes to alignment (note:  computing _delta_ of two pointers,
4048       // so hereafter %%ebp is sufficient even on 64-bit)
4049       "mov  %2, " PBP "            \n\t" // take start of row
4050       "add  " PBX "," PBP "        \n\t" // add bpp
4051       "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
4052 //    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
4053       CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
4054       "sub  %2, " PBP "            \n\t" // subtract row ptr again => ebp =
4055       "jz paeth_go                 \n\t" //  target value of ebx at alignment
4056 
4057       "xorl %%ecx, %%ecx           \n\t"
4058 
4059       SAVE_r11_r12_r13
4060 
4061       // fix alignment
4062    "paeth_lp1:                     \n\t"
4063       "xorl %%eax, %%eax           \n\t"
4064       // pav = p - a = (a + b - c) - a = b - c
4065       "movb (%1," PBX ",), %%al    \n\t" // load Prior(x) into al
4066       "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
4067       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4068       "movl %%eax, " pa_TEMP "     \n\t" // Save pav for later use
4069       "xorl %%eax, %%eax           \n\t"
4070       // pbv = p - b = (a + b - c) - b = a - c
4071       "movb (%2," PDX ",), %%al    \n\t" // load Raw(x-bpp) into al
4072       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4073       "movl %%eax, %%ecx           \n\t"
4074       // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
4075       "addl " pa_TEMP ", %%eax     \n\t" // pcv = pav + pbv
4076       // pc = abs(pcv)
4077       "testl $0x80000000, %%eax    \n\t"
4078       "jz paeth_pca                \n\t"
4079       "negl %%eax                  \n\t" // reverse sign of neg values
4080 
4081    "paeth_pca:                     \n\t"
4082       "movl %%eax, " pc_TEMP "     \n\t" // save pc for later use
4083       // pb = abs(pbv)
4084       "testl $0x80000000, %%ecx    \n\t"
4085       "jz paeth_pba                \n\t"
4086       "negl %%ecx                  \n\t" // reverse sign of neg values
4087 
4088    "paeth_pba:                     \n\t"
4089       "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
4090       // pa = abs(pav)
4091       "movl " pa_TEMP ", %%eax     \n\t"
4092       "testl $0x80000000, %%eax    \n\t"
4093       "jz paeth_paa                \n\t"
4094       "negl %%eax                  \n\t" // reverse sign of neg values
4095 
4096    "paeth_paa:                     \n\t"
4097       "movl %%eax, " pa_TEMP "     \n\t" // save pa for later use
4098       // test if pa <= pb
4099       "cmpl %%ecx, %%eax           \n\t"
4100       "jna paeth_abb               \n\t"
4101       // pa > pb; now test if pb <= pc
4102       "cmpl " pc_TEMP ", %%ecx     \n\t"
4103       "jna paeth_bbc               \n\t"
4104       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4105       "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
4106       "jmp paeth_paeth             \n\t"
4107 
4108    "paeth_bbc:                     \n\t"
4109       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4110       "movb (%1," PBX ",), %%cl    \n\t" // load Prior(x) into cl
4111       "jmp paeth_paeth             \n\t"
4112 
4113    "paeth_abb:                     \n\t"
4114       // pa <= pb; now test if pa <= pc
4115       "cmpl " pc_TEMP ", %%eax     \n\t"
4116       "jna paeth_abc               \n\t"
4117       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4118       "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
4119       "jmp paeth_paeth             \n\t"
4120 
4121    "paeth_abc:                     \n\t"
4122       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4123       "movb (%2," PDX ",), %%cl    \n\t" // load Raw(x-bpp) into cl
4124 
4125    "paeth_paeth:                   \n\t"
4126       "incl %%ebx                  \n\t"
4127       "incl %%edx                  \n\t"
4128       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4129       "addb %%cl, -1(%2," PBX ",)  \n\t"
4130       "cmpl %%ebp, %%ebx           \n\t"
4131       "jb paeth_lp1                \n\t"
4132 
4133       RESTORE_r11_r12_r13
4134 
4135    "paeth_go:                      \n\t"
4136       RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
4137       "movl %%ecx, %%eax           \n\t"
4138       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
4139       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
4140       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
4141 //out "movl %%ecx, MMXLength       \n\t"
4142       "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
4143       RESTORE_ebp                        //  (could swap ebp and edx functions)
4144       RESTORE_r15
4145       RESTORE_GOT_ebx
4146 
4147       : "=c" (MMXLength),                // output regs
4148         "=S" (dummy_value_S),
4149         "=D" (dummy_value_D),
4150         "=a" (diff)
4151 
4152       : "0" (bpp),         // ecx        // input regs
4153         "1" (prev_row),    // esi/rsi
4154         "2" (row),         // edi/rdi
4155         "3" (FullLength)   // eax
4156 
4157       : "%edx"                           // clobber list
4158         _CLOBBER_r11_r12_r13
4159         _CLOBBER_r15
4160         _CLOBBER_ebp
4161         _CLOBBER_GOT_ebx
4162    );
4163 
4164    // now do the math for the rest of the row
4165    switch (bpp)
4166    {
4167       case 3:
4168       {
4169 //       _ShiftBpp = 24;    // == bpp * 8
4170 //       _ShiftRem = 40;    // == 64 - _ShiftBpp
4171 
4172          __asm__ __volatile__ (
4173             LOAD_GOT_rbp
4174 // preload  "movl diff, %%ecx            \n\t"
4175 // preload  "movl row, %1                \n\t" // edi/rdi
4176 // preload  "movl prev_row, %0           \n\t" // esi/rsi
4177             "pxor %%mm0, %%mm0           \n\t"
4178 
4179             // prime the pump:  load the first Raw(x-bpp) data set
4180             "movq -8(%1," PCX ",), %%mm1 \n\t"
4181          "paeth_3lp:                     \n\t"
4182             "psrlq $40, %%mm1            \n\t" // shift last 3 bytes to 1st
4183                                                // 3 bytes
4184             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4185             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4186             "movq -8(%0," PCX ",), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
4187             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4188             "psrlq $40, %%mm3            \n\t" // shift last 3 bytes to 1st
4189                                                // 3 bytes
4190             // pav = p - a = (a + b - c) - a = b - c
4191             "movq %%mm2, %%mm4           \n\t"
4192             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4193             // pbv = p - b = (a + b - c) - b = a - c
4194             "movq %%mm1, %%mm5           \n\t"
4195             "psubw %%mm3, %%mm4          \n\t"
4196             "pxor %%mm7, %%mm7           \n\t"
4197             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4198             "movq %%mm4, %%mm6           \n\t"
4199             "psubw %%mm3, %%mm5          \n\t"
4200 
4201             // pa = abs(p-a) = abs(pav)
4202             // pb = abs(p-b) = abs(pbv)
4203             // pc = abs(p-c) = abs(pcv)
4204             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4205             "paddw %%mm5, %%mm6          \n\t"
4206             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
4207             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4208             "psubw %%mm0, %%mm4          \n\t"
4209             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
4210             "psubw %%mm0, %%mm4          \n\t"
4211             "psubw %%mm7, %%mm5          \n\t"
4212             "pxor %%mm0, %%mm0           \n\t"
4213             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4214             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
4215             "psubw %%mm7, %%mm5          \n\t"
4216             "psubw %%mm0, %%mm6          \n\t"
4217             //  test pa <= pb
4218             "movq %%mm4, %%mm7           \n\t"
4219             "psubw %%mm0, %%mm6          \n\t"
4220             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4221             "movq %%mm7, %%mm0           \n\t"
4222             // use mm7 mask to merge pa & pb
4223             "pand %%mm7, %%mm5           \n\t"
4224             // use mm0 mask copy to merge a & b
4225             "pand %%mm0, %%mm2           \n\t"
4226             "pandn %%mm4, %%mm7          \n\t"
4227             "pandn %%mm1, %%mm0          \n\t"
4228             "paddw %%mm5, %%mm7          \n\t"
4229             "paddw %%mm2, %%mm0          \n\t"
4230             //  test  ((pa <= pb)? pa:pb) <= pc
4231             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4232             "pxor %%mm1, %%mm1           \n\t"
4233             "pand %%mm7, %%mm3           \n\t"
4234             "pandn %%mm0, %%mm7          \n\t"
4235             "paddw %%mm3, %%mm7          \n\t"
4236             "pxor %%mm0, %%mm0           \n\t"
4237             "packuswb %%mm1, %%mm7       \n\t"
4238             "movq (%0," PCX ",), %%mm3   \n\t" // load c=Prior(x-bpp)
4239             "pand " AMASK5_3_0 ", %%mm7  \n\t" // _amask5_3_0 (was _ActiveMask)
4240             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
4241             "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
4242             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4243             "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
4244             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
4245                                                // Raw(x-bpp)
4246             // now do Paeth for 2nd set of bytes (3-5)
4247             "psrlq $24, %%mm2            \n\t" // load b=Prior(x) step 2
4248             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4249             "pxor %%mm7, %%mm7           \n\t"
4250             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4251             // pbv = p - b = (a + b - c) - b = a - c
4252             "movq %%mm1, %%mm5           \n\t"
4253             // pav = p - a = (a + b - c) - a = b - c
4254             "movq %%mm2, %%mm4           \n\t"
4255             "psubw %%mm3, %%mm5          \n\t"
4256             "psubw %%mm3, %%mm4          \n\t"
4257             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
4258             //       pav + pbv = pbv + pav
4259             "movq %%mm5, %%mm6           \n\t"
4260             "paddw %%mm4, %%mm6          \n\t"
4261 
4262             // pa = abs(p-a) = abs(pav)
4263             // pb = abs(p-b) = abs(pbv)
4264             // pc = abs(p-c) = abs(pcv)
4265             "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
4266             "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
4267             "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
4268             "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
4269             "psubw %%mm0, %%mm5          \n\t"
4270             "psubw %%mm7, %%mm4          \n\t"
4271             "psubw %%mm0, %%mm5          \n\t"
4272             "psubw %%mm7, %%mm4          \n\t"
4273             "pxor %%mm0, %%mm0           \n\t"
4274             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4275             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
4276             "psubw %%mm0, %%mm6          \n\t"
4277             //  test pa <= pb
4278             "movq %%mm4, %%mm7           \n\t"
4279             "psubw %%mm0, %%mm6          \n\t"
4280             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4281             "movq %%mm7, %%mm0           \n\t"
4282             // use mm7 mask to merge pa & pb
4283             "pand %%mm7, %%mm5           \n\t"
4284             // use mm0 mask copy to merge a & b
4285             "pand %%mm0, %%mm2           \n\t"
4286             "pandn %%mm4, %%mm7          \n\t"
4287             "pandn %%mm1, %%mm0          \n\t"
4288             "paddw %%mm5, %%mm7          \n\t"
4289             "paddw %%mm2, %%mm0          \n\t"
4290             //  test  ((pa <= pb)? pa:pb) <= pc
4291             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4292             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4293             "pand %%mm7, %%mm3           \n\t"
4294             "pandn %%mm0, %%mm7          \n\t"
4295             "pxor %%mm1, %%mm1           \n\t"
4296             "paddw %%mm3, %%mm7          \n\t"
4297             "pxor %%mm0, %%mm0           \n\t"
4298             "packuswb %%mm1, %%mm7       \n\t"
4299             "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
4300             "pand " AMASK5_3_0 ", %%mm7  \n\t" // _amask5_3_0 (was _ActiveMask)
4301             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4302             "psllq $24, %%mm7            \n\t" // shift bytes to 2nd group of
4303                                                // 3 bytes
4304              // pav = p - a = (a + b - c) - a = b - c
4305             "movq %%mm2, %%mm4           \n\t"
4306             "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
4307             "psllq $24, %%mm3            \n\t" // load c=Prior(x-bpp) step 2
4308             "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
4309             "movq %%mm7, %%mm1           \n\t"
4310             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4311             "psllq $24, %%mm1            \n\t" // shift bytes (was _ShiftBpp)
4312                                     // now mm1 will be used as Raw(x-bpp)
4313             // now do Paeth for 3rd, and final, set of bytes (6-7)
4314             "pxor %%mm7, %%mm7           \n\t"
4315             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4316             "psubw %%mm3, %%mm4          \n\t"
4317             // pbv = p - b = (a + b - c) - b = a - c
4318             "movq %%mm1, %%mm5           \n\t"
4319             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4320             "movq %%mm4, %%mm6           \n\t"
4321             "psubw %%mm3, %%mm5          \n\t"
4322             "pxor %%mm0, %%mm0           \n\t"
4323             "paddw %%mm5, %%mm6          \n\t"
4324 
4325             // pa = abs(p-a) = abs(pav)
4326             // pb = abs(p-b) = abs(pbv)
4327             // pc = abs(p-c) = abs(pcv)
4328             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4329             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4330             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
4331             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
4332             "psubw %%mm0, %%mm4          \n\t"
4333             "psubw %%mm7, %%mm5          \n\t"
4334             "psubw %%mm0, %%mm4          \n\t"
4335             "psubw %%mm7, %%mm5          \n\t"
4336             "pxor %%mm0, %%mm0           \n\t"
4337             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4338             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
4339             "psubw %%mm0, %%mm6          \n\t"
4340             //  test pa <= pb
4341             "movq %%mm4, %%mm7           \n\t"
4342             "psubw %%mm0, %%mm6          \n\t"
4343             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4344             "movq %%mm7, %%mm0           \n\t"
4345             // use mm0 mask copy to merge a & b
4346             "pand %%mm0, %%mm2           \n\t"
4347             // use mm7 mask to merge pa & pb
4348             "pand %%mm7, %%mm5           \n\t"
4349             "pandn %%mm1, %%mm0          \n\t"
4350             "pandn %%mm4, %%mm7          \n\t"
4351             "paddw %%mm2, %%mm0          \n\t"
4352             "paddw %%mm5, %%mm7          \n\t"
4353             //  test  ((pa <= pb)? pa:pb) <= pc
4354             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4355             "pand %%mm7, %%mm3           \n\t"
4356             "pandn %%mm0, %%mm7          \n\t"
4357             "paddw %%mm3, %%mm7          \n\t"
4358             "pxor %%mm1, %%mm1           \n\t"
4359             "packuswb %%mm7, %%mm1       \n\t"
4360             // step ecx to next set of 8 bytes and repeat loop til done
4361             "addl $8, %%ecx              \n\t"
4362             "pand " AMASK0_2_6 ", %%mm1  \n\t" // _amask0_2_6 (_ActiveMaskEnd)
4363             "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
4364             "cmpl %%eax, %%ecx           \n\t" // MMXLength
4365             "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
4366             "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4367                                  // mm1 will be used as Raw(x-bpp) next loop
4368                            // mm3 ready to be used as Prior(x-bpp) next loop
4369             "jb paeth_3lp                \n\t"
4370             RESTORE_rbp
4371 
4372             : "=S" (dummy_value_S),            // output regs (dummy)
4373               "=D" (dummy_value_D),
4374               "=c" (dummy_value_c),
4375               "=a" (dummy_value_a)
4376 
4377             : "0" (prev_row),  // esi/rsi      // input regs
4378               "1" (row),       // edi/rdi
4379               "2" (diff),      // ecx
4380               "3" (MMXLength)  // eax
4381 
4382 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
4383             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
4384             , "%mm4", "%mm5", "%mm6", "%mm7"
4385 #endif
4386          );
4387       }
4388       break;  // end 3 bpp
4389 
4390       case 4:
4391       {
4392          __asm__ __volatile__ (
4393 // preload  "movl diff, %%ecx            \n\t"
4394 // preload  "movl row, %1                \n\t" // edi/rdi
4395 // preload  "movl prev_row, %0           \n\t" // esi/rsi
4396             "pxor %%mm0, %%mm0           \n\t"
4397             // prime the pump:  load the first Raw(x-bpp) data set
4398             "movq -8(%1," PCX ",), %%mm1 \n\t" // only time should need to read
4399                                                //  a=Raw(x-bpp) bytes
4400          "paeth_4lp:                     \n\t"
4401             // do first set of 4 bytes
4402             "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4403             "punpckhbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4404             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4405             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4406             // pav = p - a = (a + b - c) - a = b - c
4407             "movq %%mm2, %%mm4           \n\t"
4408             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4409             // pbv = p - b = (a + b - c) - b = a - c
4410             "movq %%mm1, %%mm5           \n\t"
4411             "psubw %%mm3, %%mm4          \n\t"
4412             "pxor %%mm7, %%mm7           \n\t"
4413             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4414             "movq %%mm4, %%mm6           \n\t"
4415             "psubw %%mm3, %%mm5          \n\t"
4416             // pa = abs(p-a) = abs(pav)
4417             // pb = abs(p-b) = abs(pbv)
4418             // pc = abs(p-c) = abs(pcv)
4419             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4420             "paddw %%mm5, %%mm6          \n\t"
4421             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
4422             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4423             "psubw %%mm0, %%mm4          \n\t"
4424             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
4425             "psubw %%mm0, %%mm4          \n\t"
4426             "psubw %%mm7, %%mm5          \n\t"
4427             "pxor %%mm0, %%mm0           \n\t"
4428             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4429             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
4430             "psubw %%mm7, %%mm5          \n\t"
4431             "psubw %%mm0, %%mm6          \n\t"
4432             //  test pa <= pb
4433             "movq %%mm4, %%mm7           \n\t"
4434             "psubw %%mm0, %%mm6          \n\t"
4435             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4436             "movq %%mm7, %%mm0           \n\t"
4437             // use mm7 mask to merge pa & pb
4438             "pand %%mm7, %%mm5           \n\t"
4439             // use mm0 mask copy to merge a & b
4440             "pand %%mm0, %%mm2           \n\t"
4441             "pandn %%mm4, %%mm7          \n\t"
4442             "pandn %%mm1, %%mm0          \n\t"
4443             "paddw %%mm5, %%mm7          \n\t"
4444             "paddw %%mm2, %%mm0          \n\t"
4445             //  test  ((pa <= pb)? pa:pb) <= pc
4446             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4447             "pxor %%mm1, %%mm1           \n\t"
4448             "pand %%mm7, %%mm3           \n\t"
4449             "pandn %%mm0, %%mm7          \n\t"
4450             "paddw %%mm3, %%mm7          \n\t"
4451             "pxor %%mm0, %%mm0           \n\t"
4452             "packuswb %%mm1, %%mm7       \n\t"
4453             "movq (%0," PCX ",), %%mm3   \n\t" // load c=Prior(x-bpp)
4454             LOAD_GOT_rbp
4455             "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
4456             RESTORE_rbp
4457             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
4458             "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
4459             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4460             "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
4461             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
4462                                                // Raw(x-bpp)
4463             // do second set of 4 bytes
4464             "punpckhbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
4465             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4466             // pav = p - a = (a + b - c) - a = b - c
4467             "movq %%mm2, %%mm4           \n\t"
4468             // pbv = p - b = (a + b - c) - b = a - c
4469             "movq %%mm1, %%mm5           \n\t"
4470             "psubw %%mm3, %%mm4          \n\t"
4471             "pxor %%mm7, %%mm7           \n\t"
4472             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4473             "movq %%mm4, %%mm6           \n\t"
4474             "psubw %%mm3, %%mm5          \n\t"
4475             // pa = abs(p-a) = abs(pav)
4476             // pb = abs(p-b) = abs(pbv)
4477             // pc = abs(p-c) = abs(pcv)
4478             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4479             "paddw %%mm5, %%mm6          \n\t"
4480             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
4481             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4482             "psubw %%mm0, %%mm4          \n\t"
4483             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
4484             "psubw %%mm0, %%mm4          \n\t"
4485             "psubw %%mm7, %%mm5          \n\t"
4486             "pxor %%mm0, %%mm0           \n\t"
4487             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4488             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
4489             "psubw %%mm7, %%mm5          \n\t"
4490             "psubw %%mm0, %%mm6          \n\t"
4491             //  test pa <= pb
4492             "movq %%mm4, %%mm7           \n\t"
4493             "psubw %%mm0, %%mm6          \n\t"
4494             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4495             "movq %%mm7, %%mm0           \n\t"
4496             // use mm7 mask to merge pa & pb
4497             "pand %%mm7, %%mm5           \n\t"
4498             // use mm0 mask copy to merge a & b
4499             "pand %%mm0, %%mm2           \n\t"
4500             "pandn %%mm4, %%mm7          \n\t"
4501             "pandn %%mm1, %%mm0          \n\t"
4502             "paddw %%mm5, %%mm7          \n\t"
4503             "paddw %%mm2, %%mm0          \n\t"
4504             //  test  ((pa <= pb)? pa:pb) <= pc
4505             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4506             "pxor %%mm1, %%mm1           \n\t"
4507             "pand %%mm7, %%mm3           \n\t"
4508             "pandn %%mm0, %%mm7          \n\t"
4509             "pxor %%mm1, %%mm1           \n\t"
4510             "paddw %%mm3, %%mm7          \n\t"
4511             "pxor %%mm0, %%mm0           \n\t"
4512             // step ecx to next set of 8 bytes and repeat loop til done
4513             "addl $8, %%ecx              \n\t"
4514             "packuswb %%mm7, %%mm1       \n\t"
4515             "paddb -8(%1," PCX ",), %%mm1 \n\t" // add predictor with Raw(x)
4516             "cmpl %%eax, %%ecx           \n\t" // MMXLength
4517             "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4518                                  // mm1 will be used as Raw(x-bpp) next loop
4519             "jb paeth_4lp                \n\t"
4520 
4521             : "=S" (dummy_value_S),            // output regs (dummy)
4522               "=D" (dummy_value_D),
4523               "=c" (dummy_value_c),
4524               "=a" (dummy_value_a)
4525 
4526             : "0" (prev_row),  // esi/rsi      // input regs
4527               "1" (row),       // edi/rdi
4528               "2" (diff),      // ecx
4529               "3" (MMXLength)  // eax
4530 
4531 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
4532             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
4533             , "%mm4", "%mm5", "%mm6", "%mm7"
4534 #endif
4535          );
4536       }
4537       break;  // end 4 bpp
4538 
4539       case 1:
4540       case 2:
4541       {
4542          __asm__ __volatile__ (
4543 // preload  "movl diff, %%eax            \n\t" // eax: x = offset to align. bdry
4544 // preload  "movl FullLength, %%edx      \n\t"
4545             "cmpl %%edx, %%eax           \n\t"
4546             "jnb paeth_dend              \n\t"
4547 
4548             SAVE_ebp
4549 
4550 // preload  "movl row, %2                \n\t" // edi/rdi
4551             // do Paeth decode for remaining bytes
4552 // preload  "movl prev_row, %1           \n\t" // esi/rsi
4553             "movl %%eax, %%ebp           \n\t"
4554 // preload  "subl bpp, %%ebp             \n\t" // (bpp is preloaded into ecx)
4555             "subl %%ecx, %%ebp           \n\t" // ebp = eax - bpp
4556             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
4557 
4558             SAVE_GOT_ebx
4559             SAVE_r11_r12_r13
4560 
4561          "paeth_dlp:                     \n\t"
4562             "xorl %%ebx, %%ebx           \n\t"
4563             // pav = p - a = (a + b - c) - a = b - c
4564             "movb (%1," PAX ",), %%bl    \n\t" // load Prior(x) into bl
4565             "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
4566             "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
4567             "movl %%ebx, " pa_TEMP "     \n\t" // Save pav for later use
4568             "xorl %%ebx, %%ebx           \n\t"
4569             // pbv = p - b = (a + b - c) - b = a - c
4570             "movb (%2," PBP ",), %%bl    \n\t" // load Raw(x-bpp) into bl
4571             "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
4572             "movl %%ebx, %%ecx           \n\t"
4573             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4574             "addl " pa_TEMP ", %%ebx     \n\t" // pcv = pav + pbv
4575             // pc = abs(pcv)
4576             "testl $0x80000000, %%ebx    \n\t"
4577             "jz paeth_dpca               \n\t"
4578             "negl %%ebx                  \n\t" // reverse sign of neg values
4579 
4580          "paeth_dpca:                    \n\t"
4581             "movl %%ebx, " pc_TEMP "     \n\t" // save pc for later use
4582             // pb = abs(pbv)
4583             "testl $0x80000000, %%ecx    \n\t"
4584             "jz paeth_dpba               \n\t"
4585             "negl %%ecx                  \n\t" // reverse sign of neg values
4586 
4587          "paeth_dpba:                    \n\t"
4588             "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
4589             // pa = abs(pav)
4590             "movl " pa_TEMP ", %%ebx     \n\t"
4591             "testl $0x80000000, %%ebx    \n\t"
4592             "jz paeth_dpaa               \n\t"
4593             "negl %%ebx                  \n\t" // reverse sign of neg values
4594 
4595          "paeth_dpaa:                    \n\t"
4596             "movl %%ebx, " pa_TEMP "     \n\t" // save pa for later use
4597             // test if pa <= pb
4598             "cmpl %%ecx, %%ebx           \n\t"
4599             "jna paeth_dabb              \n\t"
4600             // pa > pb; now test if pb <= pc
4601             "cmpl " pc_TEMP ", %%ecx     \n\t"
4602             "jna paeth_dbbc              \n\t"
4603             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4604             "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
4605             "jmp paeth_dpaeth            \n\t"
4606 
4607          "paeth_dbbc:                    \n\t"
4608             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4609             "movb (%1," PAX ",), %%cl    \n\t" // load Prior(x) into cl
4610             "jmp paeth_dpaeth            \n\t"
4611 
4612          "paeth_dabb:                    \n\t"
4613             // pa <= pb; now test if pa <= pc
4614             "cmpl " pc_TEMP ", %%ebx     \n\t"
4615             "jna paeth_dabc              \n\t"
4616             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4617             "movb (%1," PBP ",), %%cl   \n\t" // load Prior(x-bpp) into cl
4618             "jmp paeth_dpaeth            \n\t"
4619 
4620          "paeth_dabc:                    \n\t"
4621             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4622             "movb (%2," PBP ",), %%cl    \n\t" // load Raw(x-bpp) into cl
4623 
4624          "paeth_dpaeth:                  \n\t"
4625             "incl %%eax                  \n\t"
4626             "incl %%ebp                  \n\t"
4627             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4628             "addb %%cl, -1(%2," PAX ",)  \n\t"
4629             "cmpl %%edx, %%eax           \n\t" // check against FullLength
4630             "jb paeth_dlp                \n\t"
4631 
4632             RESTORE_r11_r12_r13
4633             RESTORE_GOT_ebx
4634             RESTORE_ebp
4635 
4636          "paeth_dend:                    \n\t"
4637 
4638             : "=c" (dummy_value_c),            // output regs (dummy)
4639               "=S" (dummy_value_S),
4640               "=D" (dummy_value_D),
4641               "=a" (dummy_value_a),
4642               "=d" (dummy_value_d)
4643 
4644             : "0" (bpp),         // ecx        // input regs
4645               "1" (prev_row),    // esi/rsi
4646               "2" (row),         // edi/rdi
4647               "3" (diff),        // eax
4648               "4" (FullLength)   // edx
4649 
4650             CLOB_COLON_ebx_ebp_r1X             // clobber list
4651               CLOBBER_GOT_ebx
4652               CLOB_COMMA_ebx_ebp
4653               CLOBBER_ebp
4654               CLOB_COMMA_ebX_r1X
4655               CLOBBER_r11_r12_r13
4656          );
4657       }
4658       return; // end 1 or 2 bpp (no need to go further with this one)
4659 
4660       case 6:
4661       {
4662 //       _ActiveMask2 = 0xffffffff00000000LL;  // NOT USED ("_amask_0_4_4")
4663 //       _ShiftBpp = 48;       // bpp << 3 == bpp * 8
4664 //       _ShiftRem = 16;       // 64 - _ShiftBpp
4665 
4666          __asm__ __volatile__ (
4667 // preload  "movl diff, %%ecx            \n\t"
4668 // preload  "movl row, %1                \n\t" // edi/rdi
4669 // preload  "movl prev_row, %0           \n\t" // esi/rsi
4670             // prime the pump:  load the first Raw(x-bpp) data set
4671             "movq -8(%1," PCX ",), %%mm1 \n\t"
4672             "pxor %%mm0, %%mm0           \n\t"
4673 
4674          "paeth_6lp:                     \n\t"
4675             // must shift to position Raw(x-bpp) data
4676             "psrlq $16, %%mm1            \n\t" // was _ShiftRem
4677             // do first set of 4 bytes
4678             "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4679             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4680             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4681             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
4682             // must shift to position Prior(x-bpp) data
4683             "psrlq $16, %%mm3            \n\t" // was _ShiftRem
4684             // pav = p - a = (a + b - c) - a = b - c
4685             "movq %%mm2, %%mm4           \n\t"
4686             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
4687             // pbv = p - b = (a + b - c) - b = a - c
4688             "movq %%mm1, %%mm5           \n\t"
4689             "psubw %%mm3, %%mm4          \n\t"
4690             "pxor %%mm7, %%mm7           \n\t"
4691             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4692             "movq %%mm4, %%mm6           \n\t"
4693             "psubw %%mm3, %%mm5          \n\t"
4694             // pa = abs(p-a) = abs(pav)
4695             // pb = abs(p-b) = abs(pbv)
4696             // pc = abs(p-c) = abs(pcv)
4697             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4698             "paddw %%mm5, %%mm6          \n\t"
4699             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
4700             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4701             "psubw %%mm0, %%mm4          \n\t"
4702             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
4703             "psubw %%mm0, %%mm4          \n\t"
4704             "psubw %%mm7, %%mm5          \n\t"
4705             "pxor %%mm0, %%mm0           \n\t"
4706             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4707             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
4708             "psubw %%mm7, %%mm5          \n\t"
4709             "psubw %%mm0, %%mm6          \n\t"
4710             //  test pa <= pb
4711             "movq %%mm4, %%mm7           \n\t"
4712             "psubw %%mm0, %%mm6          \n\t"
4713             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4714             "movq %%mm7, %%mm0           \n\t"
4715             // use mm7 mask to merge pa & pb
4716             "pand %%mm7, %%mm5           \n\t"
4717             // use mm0 mask copy to merge a & b
4718             "pand %%mm0, %%mm2           \n\t"
4719             "pandn %%mm4, %%mm7          \n\t"
4720             "pandn %%mm1, %%mm0          \n\t"
4721             "paddw %%mm5, %%mm7          \n\t"
4722             "paddw %%mm2, %%mm0          \n\t"
4723             //  test  ((pa <= pb)? pa:pb) <= pc
4724             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4725             "pxor %%mm1, %%mm1           \n\t"
4726             "pand %%mm7, %%mm3           \n\t"
4727             "pandn %%mm0, %%mm7          \n\t"
4728             "paddw %%mm3, %%mm7          \n\t"
4729             "pxor %%mm0, %%mm0           \n\t"
4730             "packuswb %%mm1, %%mm7       \n\t"
4731             "movq -8(%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
4732             LOAD_GOT_rbp
4733             "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
4734             RESTORE_rbp
4735             "psrlq $16, %%mm3            \n\t"
4736             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x) step 1
4737             "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
4738             "movq %%mm2, %%mm6           \n\t"
4739             "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
4740             "movq -8(%1," PCX ",), %%mm1 \n\t"
4741             "psllq $48, %%mm6            \n\t" // bpp * 8 = bits per pixel
4742             "movq %%mm7, %%mm5           \n\t"
4743             "psrlq $16, %%mm1            \n\t" // 64 - (bpp * 8) = remainder
4744             "por %%mm6, %%mm3            \n\t"
4745             "psllq $48, %%mm5            \n\t" // was _ShiftBpp
4746             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4747             "por %%mm5, %%mm1            \n\t"
4748             // do second set of 4 bytes
4749             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4750             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4751             // pav = p - a = (a + b - c) - a = b - c
4752             "movq %%mm2, %%mm4           \n\t"
4753             // pbv = p - b = (a + b - c) - b = a - c
4754             "movq %%mm1, %%mm5           \n\t"
4755             "psubw %%mm3, %%mm4          \n\t"
4756             "pxor %%mm7, %%mm7           \n\t"
4757             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4758             "movq %%mm4, %%mm6           \n\t"
4759             "psubw %%mm3, %%mm5          \n\t"
4760             // pa = abs(p-a) = abs(pav)
4761             // pb = abs(p-b) = abs(pbv)
4762             // pc = abs(p-c) = abs(pcv)
4763             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4764             "paddw %%mm5, %%mm6          \n\t"
4765             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
4766             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4767             "psubw %%mm0, %%mm4          \n\t"
4768             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
4769             "psubw %%mm0, %%mm4          \n\t"
4770             "psubw %%mm7, %%mm5          \n\t"
4771             "pxor %%mm0, %%mm0           \n\t"
4772             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4773             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
4774             "psubw %%mm7, %%mm5          \n\t"
4775             "psubw %%mm0, %%mm6          \n\t"
4776             //  test pa <= pb
4777             "movq %%mm4, %%mm7           \n\t"
4778             "psubw %%mm0, %%mm6          \n\t"
4779             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4780             "movq %%mm7, %%mm0           \n\t"
4781             // use mm7 mask to merge pa & pb
4782             "pand %%mm7, %%mm5           \n\t"
4783             // use mm0 mask copy to merge a & b
4784             "pand %%mm0, %%mm2           \n\t"
4785             "pandn %%mm4, %%mm7          \n\t"
4786             "pandn %%mm1, %%mm0          \n\t"
4787             "paddw %%mm5, %%mm7          \n\t"
4788             "paddw %%mm2, %%mm0          \n\t"
4789             //  test  ((pa <= pb)? pa:pb) <= pc
4790             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4791             "pxor %%mm1, %%mm1           \n\t"
4792             "pand %%mm7, %%mm3           \n\t"
4793             "pandn %%mm0, %%mm7          \n\t"
4794             "pxor %%mm1, %%mm1           \n\t"
4795             "paddw %%mm3, %%mm7          \n\t"
4796             "pxor %%mm0, %%mm0           \n\t"
4797             // step ecx to next set of 8 bytes and repeat loop til done
4798             "addl $8, %%ecx              \n\t"
4799             "packuswb %%mm7, %%mm1       \n\t"
4800             "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
4801             "cmpl %%eax, %%ecx           \n\t" // MMXLength
4802             "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4803                                  // mm1 will be used as Raw(x-bpp) next loop
4804             "jb paeth_6lp                \n\t"
4805 
4806             : "=S" (dummy_value_S),            // output regs (dummy)
4807               "=D" (dummy_value_D),
4808               "=c" (dummy_value_c),
4809               "=a" (dummy_value_a)
4810 
4811             : "0" (prev_row),  // esi/rsi      // input regs
4812               "1" (row),       // edi/rdi
4813               "2" (diff),      // ecx
4814               "3" (MMXLength)  // eax
4815 
4816 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
4817             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
4818             , "%mm4", "%mm5", "%mm6", "%mm7"
4819 #endif
4820          );
4821       }
4822       break;  // end 6 bpp
4823 
4824       case 8:                          // bpp == 8
4825       {
4826          __asm__ __volatile__ (
4827 // preload  "movl diff, %%ecx            \n\t"
4828 // preload  "movl row, %1                \n\t" // edi/rdi
4829 // preload  "movl prev_row, %0           \n\t" // esi/rsi
4830             "pxor %%mm0, %%mm0           \n\t"
4831             // prime the pump:  load the first Raw(x-bpp) data set
4832             "movq -8(%1," PCX ",), %%mm1 \n\t" // only time should need to read
4833                                                //  a=Raw(x-bpp) bytes
4834          "paeth_8lp:                     \n\t"
4835             // do first set of 4 bytes
4836             "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4837             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4838             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4839             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
4840             // pav = p - a = (a + b - c) - a = b - c
4841             "movq %%mm2, %%mm4           \n\t"
4842             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
4843             // pbv = p - b = (a + b - c) - b = a - c
4844             "movq %%mm1, %%mm5           \n\t"
4845             "psubw %%mm3, %%mm4          \n\t"
4846             "pxor %%mm7, %%mm7           \n\t"
4847             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4848             "movq %%mm4, %%mm6           \n\t"
4849             "psubw %%mm3, %%mm5          \n\t"
4850             // pa = abs(p-a) = abs(pav)
4851             // pb = abs(p-b) = abs(pbv)
4852             // pc = abs(p-c) = abs(pcv)
4853             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4854             "paddw %%mm5, %%mm6          \n\t"
4855             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
4856             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4857             "psubw %%mm0, %%mm4          \n\t"
4858             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
4859             "psubw %%mm0, %%mm4          \n\t"
4860             "psubw %%mm7, %%mm5          \n\t"
4861             "pxor %%mm0, %%mm0           \n\t"
4862             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4863             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
4864             "psubw %%mm7, %%mm5          \n\t"
4865             "psubw %%mm0, %%mm6          \n\t"
4866             //  test pa <= pb
4867             "movq %%mm4, %%mm7           \n\t"
4868             "psubw %%mm0, %%mm6          \n\t"
4869             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4870             "movq %%mm7, %%mm0           \n\t"
4871             // use mm7 mask to merge pa & pb
4872             "pand %%mm7, %%mm5           \n\t"
4873             // use mm0 mask copy to merge a & b
4874             "pand %%mm0, %%mm2           \n\t"
4875             "pandn %%mm4, %%mm7          \n\t"
4876             "pandn %%mm1, %%mm0          \n\t"
4877             "paddw %%mm5, %%mm7          \n\t"
4878             "paddw %%mm2, %%mm0          \n\t"
4879             //  test  ((pa <= pb)? pa:pb) <= pc
4880             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4881             "pxor %%mm1, %%mm1           \n\t"
4882             "pand %%mm7, %%mm3           \n\t"
4883             "pandn %%mm0, %%mm7          \n\t"
4884             "paddw %%mm3, %%mm7          \n\t"
4885             "pxor %%mm0, %%mm0           \n\t"
4886             "packuswb %%mm1, %%mm7       \n\t"
4887             "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4888             LOAD_GOT_rbp
4889             "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
4890             RESTORE_rbp
4891             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4892             "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
4893             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4894             "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
4895             "movq -8(%1," PCX ",), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4896 
4897             // do second set of 4 bytes
4898             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4899             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4900             // pav = p - a = (a + b - c) - a = b - c
4901             "movq %%mm2, %%mm4           \n\t"
4902             // pbv = p - b = (a + b - c) - b = a - c
4903             "movq %%mm1, %%mm5           \n\t"
4904             "psubw %%mm3, %%mm4          \n\t"
4905             "pxor %%mm7, %%mm7           \n\t"
4906             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4907             "movq %%mm4, %%mm6           \n\t"
4908             "psubw %%mm3, %%mm5          \n\t"
4909             // pa = abs(p-a) = abs(pav)
4910             // pb = abs(p-b) = abs(pbv)
4911             // pc = abs(p-c) = abs(pcv)
4912             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4913             "paddw %%mm5, %%mm6          \n\t"
4914             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
4915             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4916             "psubw %%mm0, %%mm4          \n\t"
4917             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
4918             "psubw %%mm0, %%mm4          \n\t"
4919             "psubw %%mm7, %%mm5          \n\t"
4920             "pxor %%mm0, %%mm0           \n\t"
4921             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4922             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
4923             "psubw %%mm7, %%mm5          \n\t"
4924             "psubw %%mm0, %%mm6          \n\t"
4925             //  test pa <= pb
4926             "movq %%mm4, %%mm7           \n\t"
4927             "psubw %%mm0, %%mm6          \n\t"
4928             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4929             "movq %%mm7, %%mm0           \n\t"
4930             // use mm7 mask to merge pa & pb
4931             "pand %%mm7, %%mm5           \n\t"
4932             // use mm0 mask copy to merge a & b
4933             "pand %%mm0, %%mm2           \n\t"
4934             "pandn %%mm4, %%mm7          \n\t"
4935             "pandn %%mm1, %%mm0          \n\t"
4936             "paddw %%mm5, %%mm7          \n\t"
4937             "paddw %%mm2, %%mm0          \n\t"
4938             //  test  ((pa <= pb)? pa:pb) <= pc
4939             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4940             "pxor %%mm1, %%mm1           \n\t"
4941             "pand %%mm7, %%mm3           \n\t"
4942             "pandn %%mm0, %%mm7          \n\t"
4943             "pxor %%mm1, %%mm1           \n\t"
4944             "paddw %%mm3, %%mm7          \n\t"
4945             "pxor %%mm0, %%mm0           \n\t"
4946             // step ecx to next set of 8 bytes and repeat loop til done
4947             "addl $8, %%ecx              \n\t"
4948             "packuswb %%mm7, %%mm1       \n\t"
4949             "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
4950             "cmpl %%eax, %%ecx           \n\t" // MMXLength
4951             "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4952                                  // mm1 will be used as Raw(x-bpp) next loop
4953             "jb paeth_8lp                \n\t"
4954 
4955             : "=S" (dummy_value_S),            // output regs (dummy)
4956               "=D" (dummy_value_D),
4957               "=c" (dummy_value_c),
4958               "=a" (dummy_value_a)
4959 
4960             : "0" (prev_row),  // esi/rsi      // input regs
4961               "1" (row),       // edi/rdi
4962               "2" (diff),      // ecx
4963               "3" (MMXLength)  // eax
4964 
4965 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
4966             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
4967             , "%mm4", "%mm5", "%mm6", "%mm7"
4968 #endif
4969          );
4970       }
4971       break;  // end 8 bpp
4972 
4973       default:                // bpp != 1,2,3,4,6,8:  doesn't exist
4974       {
4975          // ERROR:  SHOULD NEVER BE REACHED
4976 #if defined(PNG_DEBUG)
4977          png_debug(1, "Internal libpng logic error (GCC "
4978            "png_read_filter_row_mmx_paeth())\n");
4979 #endif
4980       }
4981       break;
4982 
4983    } // end switch (bpp)
4984 
4985    __asm__ __volatile__ (
4986       // MMX acceleration complete; now do clean-up
4987       // check if any remaining bytes left to decode
4988 //pre "movl FullLength, %%edx      \n\t"
4989 //pre "movl MMXLength, %%eax       \n\t"
4990       "cmpl %%edx, %%eax           \n\t"
4991       "jnb paeth_end               \n\t"
4992 
4993       SAVE_ebp
4994 
4995 //pre "movl row, %2                \n\t" // edi/rdi
4996 //pre "movl prev_row, %1           \n\t" // esi/rsi
4997       // do Paeth decode for remaining bytes
4998       "movl %%eax, %%ebp           \n\t"
4999 //pre "subl bpp, %%ebp             \n\t" // (bpp is preloaded into ecx)
5000       "subl %%ecx, %%ebp           \n\t" // ebp = eax - bpp
5001       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
5002 
5003       SAVE_GOT_ebx
5004       SAVE_r11_r12_r13
5005 
5006    "paeth_lp2:                     \n\t"
5007       "xorl %%ebx, %%ebx           \n\t"
5008       // pav = p - a = (a + b - c) - a = b - c
5009       "movb (%1," PAX ",), %%bl    \n\t" // load Prior(x) into bl
5010       "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
5011       "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
5012       "movl %%ebx, " pa_TEMP "     \n\t" // Save pav for later use
5013       "xorl %%ebx, %%ebx           \n\t"
5014       // pbv = p - b = (a + b - c) - b = a - c
5015       "movb (%2," PBP ",), %%bl    \n\t" // load Raw(x-bpp) into bl
5016       "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
5017       "movl %%ebx, %%ecx           \n\t"
5018       // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
5019       "addl " pa_TEMP ", %%ebx     \n\t" // pcv = pav + pbv
5020       // pc = abs(pcv)
5021       "testl $0x80000000, %%ebx    \n\t"
5022       "jz paeth_pca2               \n\t"
5023       "negl %%ebx                  \n\t" // reverse sign of neg values
5024 
5025    "paeth_pca2:                    \n\t"
5026       "movl %%ebx, " pc_TEMP "     \n\t" // save pc for later use
5027       // pb = abs(pbv)
5028       "testl $0x80000000, %%ecx    \n\t"
5029       "jz paeth_pba2               \n\t"
5030       "negl %%ecx                  \n\t" // reverse sign of neg values
5031 
5032    "paeth_pba2:                    \n\t"
5033       "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
5034       // pa = abs(pav)
5035       "movl " pa_TEMP ", %%ebx     \n\t"
5036       "testl $0x80000000, %%ebx    \n\t"
5037       "jz paeth_paa2               \n\t"
5038       "negl %%ebx                  \n\t" // reverse sign of neg values
5039 
5040    "paeth_paa2:                    \n\t"
5041       "movl %%ebx, " pa_TEMP "     \n\t" // save pa for later use
5042       // test if pa <= pb
5043       "cmpl %%ecx, %%ebx           \n\t"
5044       "jna paeth_abb2              \n\t"
5045       // pa > pb; now test if pb <= pc
5046       "cmpl " pc_TEMP ", %%ecx     \n\t"
5047       "jna paeth_bbc2              \n\t"
5048       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
5049       "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
5050       "jmp paeth_paeth2            \n\t"
5051 
5052    "paeth_bbc2:                    \n\t"
5053       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
5054       "movb (%1," PAX ",), %%cl    \n\t" // load Prior(x) into cl
5055       "jmp paeth_paeth2            \n\t"
5056 
5057    "paeth_abb2:                    \n\t"
5058       // pa <= pb; now test if pa <= pc
5059       "cmpl " pc_TEMP ", %%ebx     \n\t"
5060       "jna paeth_abc2              \n\t"
5061       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
5062       "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
5063       "jmp paeth_paeth2            \n\t"
5064 
5065    "paeth_abc2:                    \n\t"
5066       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
5067       "movb (%2," PBP ",), %%cl    \n\t" // load Raw(x-bpp) into cl
5068 
5069    "paeth_paeth2:                  \n\t"
5070       "incl %%eax                  \n\t"
5071       "incl %%ebp                  \n\t"
5072       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
5073       "addb %%cl, -1(%2," PAX ",)  \n\t"
5074       "cmpl %%edx, %%eax           \n\t" // check against FullLength
5075       "jb paeth_lp2                \n\t"
5076 
5077       RESTORE_r11_r12_r13
5078       RESTORE_GOT_ebx
5079       RESTORE_ebp
5080 
5081    "paeth_end:                     \n\t"
5082       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
5083 
5084       : "=c" (dummy_value_c),            // output regs (dummy)
5085         "=S" (dummy_value_S),
5086         "=D" (dummy_value_D),
5087         "=a" (dummy_value_a),
5088         "=d" (dummy_value_d)
5089 
5090       : "0" (bpp),         // ecx        // input regs
5091         "1" (prev_row),    // esi/rsi
5092         "2" (row),         // edi/rdi
5093         "3" (MMXLength),   // eax
5094         "4" (FullLength)   // edx
5095 
5096       CLOB_COLON_ebx_ebp_r1X             // clobber list
5097         CLOBBER_GOT_ebx
5098         CLOB_COMMA_ebx_ebp
5099         CLOBBER_ebp
5100         CLOB_COMMA_ebX_r1X
5101         CLOBBER_r11_r12_r13
5102    );
5103 
5104 } /* end png_read_filter_row_mmx_paeth() */
5105 
5106 #endif // PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK
5107 #endif /* PNG_MMX_READ_FILTER_PAETH_SUPPORTED */
5108 
5109 
5110 
5111 
5112 #if defined(PNG_MMX_READ_FILTER_SUB_SUPPORTED)
5113 
5114 //===========================================================================//
5115 //                                                                           //
5116 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
5117 //                                                                           //
5118 //===========================================================================//
5119 
5120 // Optimized code for PNG Sub filter decoder
5121 
5122 static void /* PRIVATE */
5123 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
5124 {
5125    unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
5126    int bpp;
5127    int dummy_value_a;
5128    int dummy_value_c;
5129    int dummy_value_d;
5130    png_bytep dummy_value_D;
5131    int diff; //     __attribute__((used));
5132 
5133    bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
5134    FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
5135      // (why do we subtract off bpp?  not so in avg or paeth...)
5136 
5137    __asm__ __volatile__ (
5138       SAVE_r15
5139       SAVE_ebp
5140 //pre "movl row, %1                \n\t" // edi/rdi
5141       "mov  %1, " PSI "            \n\t" // lp = row
5142 //pre "movl bpp, %%ecx             \n\t"
5143       "add  " PCX ", %1            \n\t" // rp = row + bpp
5144 //pre "movl FullLength, %%eax      \n\t" // bring in via eax...
5145       SAVE_FullLength                    // ...but store for later use
5146 
5147       "xorl %%eax, %%eax           \n\t"
5148 
5149       // get # of bytes to alignment (note:  computing _delta_ of two pointers,
5150       // so hereafter %%ebp is sufficient even on 64-bit)
5151       "mov  %1, " PBP "            \n\t" // take start of row
5152       "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
5153 //    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
5154       CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
5155       "sub  %1, " PBP "            \n\t" // subtract row ptr again => ebp =
5156       "jz sub_go                   \n\t" //  target value of eax at alignment
5157 
5158    "sub_lp1:                       \n\t" // fix alignment
5159       "movb (" PSI "," PAX ",), %%cl \n\t"
5160       "addb %%cl, (%1," PAX ",)    \n\t"
5161       "incl %%eax                  \n\t"
5162       "cmpl %%ebp, %%eax           \n\t"
5163       "jb sub_lp1                  \n\t"
5164 
5165    "sub_go:                        \n\t"
5166       RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
5167       "movl %%ecx, %%edx           \n\t"
5168       "subl %%eax, %%edx           \n\t" // subtract alignment fix
5169       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
5170       "subl %%edx, %%ecx           \n\t" // drop over bytes from length
5171 //out "movl %%ecx, MMXLength       \n\t"
5172       "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
5173       RESTORE_ebp                        //  (could swap ebp and ecx functions,
5174       RESTORE_r15                        //  but %%cl issues...)
5175 
5176       : "=c" (MMXLength),       // 0     // output regs
5177         "=D" (dummy_value_D),   // 1
5178         "=a" (diff)             // 2
5179 
5180       : "0" (bpp),              // ecx   // input regs
5181         "1" (row),              // edi
5182         "2" (FullLength)        // eax
5183 
5184       : "%esi", "%edx"                   // clobber list
5185         _CLOBBER_r15
5186         _CLOBBER_ebp
5187    );
5188 
5189    // now do the math for the rest of the row
5190    switch (bpp)
5191    {
5192       case 3:
5193       {
5194 //       _ShiftBpp = 24;       // == 3 * 8
5195 //       _ShiftRem  = 40;      // == 64 - 24
5196 
5197          __asm__ __volatile__ (
5198 // preload  "mov  row, %1                 \n\t" // edi/rdi
5199             LOAD_GOT_rbp
5200             // load (former) _ActiveMask for 2nd active byte group
5201             "movq " AMASK2_3_3 ", %%mm7   \n\t" // _amask2_3_3
5202             RESTORE_rbp
5203 
5204 // notused  "mov  %1, " PSI "             \n\t" // lp = row
5205 // preload  "movl bpp, %%ecx              \n\t"
5206             "add  " PCX ", %1             \n\t" // rp = row + bpp
5207             "movq %%mm7, %%mm6            \n\t"
5208 // preload  "movl diff, %%edx             \n\t"
5209             "psllq $24, %%mm6             \n\t" // move mask in mm6 to cover
5210                                                 //  3rd active byte group
5211             // prime the pump:  load the first Raw(x-bpp) data set
5212             "movq -8(%1," PDX ",), %%mm1  \n\t"
5213 
5214          "sub_3lp:                        \n\t" // shift data for adding first
5215             "psrlq $40, %%mm1             \n\t" //  bpp bytes (no need for mask;
5216                                                 //  shift clears inactive bytes)
5217             // add 1st active group
5218             "movq (%1," PDX ",), %%mm0    \n\t"
5219             "paddb %%mm1, %%mm0           \n\t"
5220 
5221             // add 2nd active group
5222             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5223             "psllq $24, %%mm1             \n\t" // shift data to pos. correctly
5224             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
5225             "paddb %%mm1, %%mm0           \n\t"
5226 
5227             // add 3rd active group
5228             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5229             "psllq $24, %%mm1             \n\t" // shift data to pos. correctly
5230             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
5231             "addl $8, %%edx               \n\t"
5232             "paddb %%mm1, %%mm0           \n\t"
5233 
5234             "cmpl %%eax, %%edx            \n\t" // MMXLength
5235             "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
5236             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
5237             "jb sub_3lp                   \n\t"
5238 
5239             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5240               "=D" (dummy_value_D),   // 1
5241               "=d" (dummy_value_d),   // 2
5242               "=a" (dummy_value_a)    // 3
5243 
5244             : "0" (bpp),              // ecx    // input regs
5245               "1" (row),              // edi
5246               "2" (diff),             // edx
5247               "3" (MMXLength)         // eax
5248 
5249 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
5250             : "%mm0", "%mm1", "%mm6", "%mm7"    // clobber list
5251 #endif
5252          );
5253       }
5254       break;  // end 3 bpp
5255 
5256       case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
5257       {         // but 64-bit PIC/.so problems (could still share, moving vars
5258                 // into unused MMX regs via ecx/edx, but kludgy)
5259 //       _ShiftBpp = bpp << 3;        // 32 (psllq)
5260 //       _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)
5261 
5262          __asm__ __volatile__ (
5263 // preload  "mov  row, %1                 \n\t" // edi/rdi
5264 // preload  "movl diff, %%edx             \n\t"
5265 // notused  "mov  %1, " PSI "             \n\t" // lp = row
5266 // preload  "movl bpp, %%ecx              \n\t"
5267             "add  " PCX ", %1             \n\t" // rp = row + bpp
5268 
5269             // prime the pump:  load the first Raw(x-bpp) data set
5270             "movq -8(%1," PDX ",), %%mm1  \n\t"
5271 
5272          "sub_4lp:                        \n\t" // shift data for adding first
5273             "psrlq $32, %%mm1             \n\t" //  bpp bytes (no need for mask;
5274                                                 //  shift clears inactive bytes)
5275             "movq (%1," PDX ",), %%mm0    \n\t"
5276             "paddb %%mm1, %%mm0           \n\t"
5277 
5278             // add 2nd active group
5279             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5280             "psllq $32, %%mm1             \n\t" // shift data to pos. correctly
5281             "addl $8, %%edx               \n\t"
5282             "paddb %%mm1, %%mm0           \n\t"
5283 
5284             "cmpl %%eax, %%edx            \n\t" // MMXLength
5285             "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
5286             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
5287             "jb sub_4lp                   \n\t"
5288 
5289             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5290               "=D" (dummy_value_D),   // 1
5291               "=d" (dummy_value_d),   // 2
5292               "=a" (dummy_value_a)    // 3
5293 
5294             : "0" (bpp),              // ecx    // input regs
5295               "1" (row),              // edi
5296               "2" (diff),             // edx
5297               "3" (MMXLength)         // eax
5298 
5299 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
5300             : "%mm0", "%mm1"                    // clobber list
5301 #endif
5302          );
5303       }
5304       break;  // end 4 bpp
5305 
5306       case 1:
5307       {
5308          __asm__ __volatile__ (
5309 // preload  "movl diff, %%edx              \n\t"
5310 // preload  "mov  row, %1                  \n\t" // edi/rdi
5311 // preload  "cmpl FullLength, %%edx        \n\t"
5312             "cmpl %%eax, %%edx             \n\t"
5313             "jnb sub_1end                  \n\t"
5314             "mov  %1, " PSI "              \n\t" // lp = row
5315 // irrel.   "xorl %%ecx, %%ecx             \n\t" // (actually bug with preload)
5316 // preload  "movl bpp, %%ecx               \n\t"
5317             "add  " PCX ", %1              \n\t" // rp = row + bpp
5318 
5319          "sub_1lp:                         \n\t"
5320             "movb (" PSI "," PDX ",), %%cl \n\t"
5321             "addb %%cl, (%1," PDX ",)      \n\t"
5322             "incl %%edx                    \n\t"
5323             "cmpl %%eax, %%edx             \n\t" // compare with FullLength
5324             "jb sub_1lp                    \n\t"
5325 
5326          "sub_1end:                        \n\t"
5327 
5328             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5329               "=D" (dummy_value_D),   // 1
5330               "=d" (dummy_value_d),   // 2
5331               "=a" (dummy_value_a)    // 3
5332 
5333             : "0" (bpp),              // ecx    // input regs
5334               "1" (row),              // edi
5335               "2" (diff),             // edx
5336               "3" (FullLength)        // eax
5337 
5338             : "%esi"                            // clobber list
5339          );
5340       }
5341       return;  // end 1 bpp (bypassing cleanup block!)
5342 
5343       case 2:
5344       {
5345 //       _ShiftBpp = 16;       // == 2 * 8
5346 //       _ShiftRem = 48;       // == 64 - 16
5347 
5348          __asm__ __volatile__ (
5349             LOAD_GOT_rbp
5350             // load (former) _ActiveMask for 2nd active byte group
5351             "movq " AMASK4_2_2 ", %%mm7   \n\t" // _amask4_2_2
5352             RESTORE_rbp
5353 // preload  "movl diff, %%edx             \n\t"
5354             "movq %%mm7, %%mm6            \n\t"
5355 // preload  "mov  row, %1                 \n\t" // edi/rdi
5356             "psllq $16, %%mm6             \n\t" // move mask in mm6 to cover
5357                                                 //  3rd active byte group
5358 // notused  "mov  %1, " PSI "             \n\t" // lp = row
5359             "movq %%mm6, %%mm5            \n\t"
5360 // preload  "movl bpp, %%ecx              \n\t"
5361             "add  " PCX ", %1             \n\t" // rp = row + bpp
5362             "psllq $16, %%mm5             \n\t" // move mask in mm5 to cover
5363                                                 //  4th active byte group
5364             // prime the pump:  load the first Raw(x-bpp) data set
5365             "movq -8(%1," PDX ",), %%mm1  \n\t"
5366 
5367          "sub_2lp:                        \n\t" // shift data for adding first
5368             "psrlq $48, %%mm1             \n\t" //  bpp bytes (no need for mask;
5369                                                 //  shift clears inactive bytes)
5370             // add 1st active group
5371             "movq (%1," PDX ",), %%mm0    \n\t"
5372             "paddb %%mm1, %%mm0           \n\t"
5373 
5374             // add 2nd active group
5375             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5376             "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
5377             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
5378             "paddb %%mm1, %%mm0           \n\t"
5379 
5380             // add 3rd active group
5381             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5382             "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
5383             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
5384             "paddb %%mm1, %%mm0           \n\t"
5385 
5386             // add 4th active group
5387             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5388             "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
5389             "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
5390             "addl $8, %%edx               \n\t"
5391             "paddb %%mm1, %%mm0           \n\t"
5392             "cmpl %%eax, %%edx            \n\t" // MMXLength
5393             "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
5394             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
5395             "jb sub_2lp                   \n\t"
5396 
5397             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5398               "=D" (dummy_value_D),   // 1
5399               "=d" (dummy_value_d),   // 2
5400               "=a" (dummy_value_a)    // 3
5401 
5402             : "0" (bpp),              // ecx    // input regs
5403               "1" (row),              // edi
5404               "2" (diff),             // edx
5405               "3" (MMXLength)         // eax
5406 
5407 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
5408             : "%mm0", "%mm1", "%mm5", "%mm6"    // clobber list
5409             , "%mm7"
5410 #endif
5411          );
5412       }
5413       break;  // end 2 bpp
5414 
5415       case 6:   // formerly shared with 4 bpp case (see comments there)
5416       {
5417 //       _ShiftBpp = bpp << 3;        // 48 (psllq)
5418 //       _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)
5419 
5420          __asm__ __volatile__ (
5421 // preload  "mov  row, %1                 \n\t" // edi/rdi
5422 // preload  "movl diff, %%edx             \n\t"
5423 // notused  "mov  %1, " PSI "             \n\t" // lp = row
5424 // preload  "movl bpp, %%ecx              \n\t"
5425             "add  " PCX ", %1             \n\t" // rp = row + bpp
5426 
5427             // prime the pump:  load the first Raw(x-bpp) data set
5428             "movq -8(%1," PDX ",), %%mm1  \n\t"
5429 
5430          "sub_6lp:                        \n\t" // shift data for adding first
5431             "psrlq $16, %%mm1             \n\t" //  bpp bytes (no need for mask;
5432                                                 //  shift clears inactive bytes)
5433             "movq (%1," PDX ",), %%mm0    \n\t"
5434             "paddb %%mm1, %%mm0           \n\t"
5435 
5436             // add 2nd active group
5437             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5438             "psllq $48, %%mm1             \n\t" // shift data to pos. correctly
5439             "addl $8, %%edx               \n\t"
5440             "paddb %%mm1, %%mm0           \n\t"
5441 
5442             "cmpl %%eax, %%edx            \n\t" // MMXLength
5443             "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
5444             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
5445             "jb sub_6lp                   \n\t"
5446 
5447             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5448               "=D" (dummy_value_D),   // 1
5449               "=d" (dummy_value_d),   // 2
5450               "=a" (dummy_value_a)    // 3
5451 
5452             : "0" (bpp),              // ecx    // input regs
5453               "1" (row),              // edi
5454               "2" (diff),             // edx
5455               "3" (MMXLength)         // eax
5456 
5457 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
5458             : "%mm0", "%mm1"                    // clobber list
5459 #endif
5460          );
5461       }
5462       break;  // end 6 bpp
5463 
5464       case 8:
5465       {
5466          __asm__ __volatile__ (
5467 // preload  "mov  row, %1                 \n\t" // edi/rdi
5468 // preload  "movl diff, %%edx             \n\t"
5469 // notused  "mov  %1, " PSI "             \n\t" // lp = row
5470 // preload  "movl bpp, %%ecx              \n\t"
5471             "add  " PCX ", %1             \n\t" // rp = row + bpp
5472 // preload  "movl MMXLength, %%eax        \n\t"
5473 
5474             // prime the pump:  load the first Raw(x-bpp) data set
5475             "movq -8(%1," PDX ",), %%mm7  \n\t"
5476             "movl %%eax, %%esi            \n\t" // copy of MMXLength -> esi
5477             "andl $0x0000003f, %%esi      \n\t" // calc bytes over mult of 64
5478 
5479          "sub_8lp:                        \n\t"
5480             "movq (%1," PDX ",), %%mm0    \n\t" // load Sub(x) for 1st 8 bytes
5481             "paddb %%mm7, %%mm0           \n\t"
5482             "movq 8(%1," PDX ",), %%mm1   \n\t" // load Sub(x) for 2nd 8 bytes
5483             "movq %%mm0, (%1," PDX ",)    \n\t" // write Raw(x) for 1st 8 bytes
5484 
5485             // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
5486             // This will be repeated for each group of 8 bytes with the 8th
5487             // group being used as the Raw(x-bpp) for the 1st group of the
5488             // next loop.
5489 
5490             "paddb %%mm0, %%mm1           \n\t"
5491             "movq 16(%1," PDX ",), %%mm2  \n\t" // load Sub(x) for 3rd 8 bytes
5492             "movq %%mm1, 8(%1," PDX ",)   \n\t" // write Raw(x) for 2nd 8 bytes
5493             "paddb %%mm1, %%mm2           \n\t"
5494             "movq 24(%1," PDX ",), %%mm3  \n\t" // load Sub(x) for 4th 8 bytes
5495             "movq %%mm2, 16(%1," PDX ",)  \n\t" // write Raw(x) for 3rd 8 bytes
5496             "paddb %%mm2, %%mm3           \n\t"
5497             "movq 32(%1," PDX ",), %%mm4  \n\t" // load Sub(x) for 5th 8 bytes
5498             "movq %%mm3, 24(%1," PDX ",)  \n\t" // write Raw(x) for 4th 8 bytes
5499             "paddb %%mm3, %%mm4           \n\t"
5500             "movq 40(%1," PDX ",), %%mm5  \n\t" // load Sub(x) for 6th 8 bytes
5501             "movq %%mm4, 32(%1," PDX ",)  \n\t" // write Raw(x) for 5th 8 bytes
5502             "paddb %%mm4, %%mm5           \n\t"
5503             "movq 48(%1," PDX ",), %%mm6  \n\t" // load Sub(x) for 7th 8 bytes
5504             "movq %%mm5, 40(%1," PDX ",)  \n\t" // write Raw(x) for 6th 8 bytes
5505             "paddb %%mm5, %%mm6           \n\t"
5506             "movq 56(%1," PDX ",), %%mm7  \n\t" // load Sub(x) for 8th 8 bytes
5507             "movq %%mm6, 48(%1," PDX ",)  \n\t" // write Raw(x) for 7th 8 bytes
5508             "addl $64, %%edx              \n\t"
5509             "paddb %%mm6, %%mm7           \n\t"
5510             "cmpl %%esi, %%edx            \n\t" // cmp to bytes over mult of 64
5511             "movq %%mm7, -8(%1," PDX ",)  \n\t" // write Raw(x) for 8th 8 bytes
5512             "jb sub_8lp                   \n\t"
5513 
5514             "cmpl %%eax, %%edx            \n\t" // compare to MMXLength
5515             "jnb sub_8lt8                 \n\t"
5516 
5517          "sub_8lpA:                       \n\t"
5518             "movq (%1," PDX ",), %%mm0    \n\t"
5519             "addl $8, %%edx               \n\t"
5520             "paddb %%mm7, %%mm0           \n\t"
5521             "cmpl %%eax, %%edx            \n\t" // compare to MMXLength
5522             "movq %%mm0, -8(%1," PDX ",)  \n\t" // -8 to offset early addl edx
5523             "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
5524             "jb sub_8lpA                  \n\t" //  to mm7 to be new Raw(x-bpp)
5525                                                 //  for next loop
5526          "sub_8lt8:                       \n\t"
5527 
5528             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5529               "=D" (dummy_value_D),   // 1
5530               "=d" (dummy_value_d),   // 2
5531               "=a" (dummy_value_a)    // 3
5532 
5533             : "0" (bpp),              // ecx    // input regs
5534               "1" (row),              // edi
5535               "2" (diff),             // edx
5536               "3" (MMXLength)         // eax
5537 
5538             : "%esi"                            // clobber list
5539 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
5540             , "%mm0", "%mm1", "%mm2", "%mm3"
5541             , "%mm4", "%mm5", "%mm6", "%mm7"
5542 #endif
5543          );
5544       }
5545       break;  // end 8 bpp
5546 
5547       default:                // bpp != 1,2,3,4,6,8:  doesn't exist
5548       {
5549          // ERROR:  SHOULD NEVER BE REACHED
5550 #if defined(PNG_DEBUG)
5551          png_debug(1, "Internal libpng logic error (GCC "
5552            "png_read_filter_row_mmx_sub())\n");
5553 #endif
5554       }
5555       break;
5556 
5557    } // end switch (bpp)
5558 
5559    __asm__ __volatile__ (
5560 //pre "movl MMXLength, %%eax         \n\t"
5561 //pre "mov  row, %1                  \n\t" // edi/rdi
5562 //pre "cmpl FullLength, %%eax        \n\t"
5563       "cmpl %%edx, %%eax             \n\t"
5564       "jnb sub_end                   \n\t"
5565 
5566       "mov  %1, " PSI "              \n\t" // lp = row
5567 //pre "movl bpp, %%ecx               \n\t"
5568       "add  " PCX ", %1              \n\t" // rp = row + bpp
5569       "xorl %%ecx, %%ecx             \n\t"
5570 
5571    "sub_lp2:                         \n\t"
5572       "movb (" PSI "," PAX ",), %%cl \n\t"
5573       "addb %%cl, (%1," PAX ",)      \n\t"
5574       "incl %%eax                    \n\t"
5575       "cmpl %%edx, %%eax             \n\t" // FullLength
5576       "jb sub_lp2                    \n\t"
5577 
5578    "sub_end:                         \n\t"
5579       "EMMS                          \n\t" // end MMX instructions
5580 
5581       : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5582         "=D" (dummy_value_D),   // 1
5583         "=a" (dummy_value_a),   // 2
5584         "=d" (dummy_value_d)    // 3
5585 
5586       : "0" (bpp),              // ecx    // input regs
5587         "1" (row),              // edi
5588         "2" (MMXLength),        // eax
5589         "3" (FullLength)        // edx
5590 
5591       : "%esi"                            // clobber list
5592    );
5593 
5594 } // end of png_read_filter_row_mmx_sub()
5595 
5596 #endif /* PNG_MMX_READ_FILTER_SUB_SUPPORTED */
5597 
5598 
5599 
5600 
5601 #if defined(PNG_MMX_READ_FILTER_UP_SUPPORTED)
5602 
5603 //===========================================================================//
5604 //                                                                           //
5605 //            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
5606 //                                                                           //
5607 //===========================================================================//
5608 
5609 // Optimized code for PNG Up filter decoder
5610 
// Undo the PNG "Up" filter on one row, in place.
//
// Intended net effect:  each row[i] has prev_row[i] added to it (modulo 256)
// for 0 <= i < row_info->rowbytes.
//
//   row_info - supplies rowbytes, the number of bytes to process
//   row      - filtered row; overwritten with the reconstructed bytes
//   prev_row - the corresponding bytes of the previous row; only read
//
// A single asm statement does all the work, with ebx as the running offset:
//   up_lp1   byte loop until row+ebx reaches the next 8-byte boundary
//   up_loop  MMX loop, 64 bytes per pass
//   up_lpA   MMX loop, 8 bytes per pass, for what is left of the last 64
//   up_lp2   byte loop for the final (< 8) bytes
// and EMMS is executed before returning.
//
// NOTE(review):  up_lp1 never compares against len, and up_loop tests its
// bound only after the first 64-byte pass, so rows shorter than the alignment
// fix plus 64 bytes would be overrun; presumably the caller's rowbytes
// threshold guarantees a long enough row -- confirm.
5611 static void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info,png_bytep row,png_bytep prev_row)5612 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
5613                            png_bytep prev_row)
5614 {
5615    unsigned len;        // png_uint_32 is actually 64-bit on x86-64
5616    int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
5617    png_bytep dummy_value_S;
5618    png_bytep dummy_value_D;
5619 
5620    len = row_info->rowbytes;              // number of bytes to filter
5621 
5622    __asm__ __volatile__ (
5623       SAVE_GOT_ebx
5624 //pre "mov  prev_row, %1           \n\t" // esi/rsi
5625 //pre "movl row, %2                \n\t" // edi/rdi
5626 
5627       "xorl %%ebx, %%ebx           \n\t"
5628       "xorl %%eax, %%eax           \n\t"
5629 
5630       // get # of bytes to alignment (note:  computing _delta_ of two pointers,
5631       // so hereafter %%ecx is sufficient even on 64-bit)
5632       "mov  %2, " PCX "            \n\t" // take start of row
5633       "add  $0x7, " PCX "          \n\t" // add 7 to incr past alignment bdry
5634 //    "andl $0xfffffff8, %%ecx     \n\t" // mask to alignment boundary (32-bit!)
5635       CLEAR_BOTTOM_3_BITS  PCX    "\n\t" // mask to alignment boundary
5636       "sub  %2, " PCX "            \n\t" // subtract row ptr again => ecx =
5637       "jz up_go                    \n\t" //  target value of ebx at alignment
5638 
5639    "up_lp1:                        \n\t" // fix alignment
5640       "movb (%2," PBX ",), %%al    \n\t"
5641       "addb (%1," PBX ",), %%al    \n\t"
5642       "incl %%ebx                  \n\t"
5643       "cmpl %%ecx, %%ebx           \n\t"
5644       "movb %%al, -1(%2," PBX ",)  \n\t" // mov does not affect flags; -1 to
5645       "jb up_lp1                   \n\t" //  offset incl ebx
5646 
5647    "up_go:                         \n\t"
5648 //pre "movl len, %%edx             \n\t"
5649       "movl %%edx, %%ecx           \n\t"
5650       "subl %%ebx, %%edx           \n\t" // subtract alignment fix
5651       "andl $0x0000003f, %%edx     \n\t" // calc bytes over mult of 64
5652       "subl %%edx, %%ecx           \n\t" // sub over-bytes from original length
      // here:  ecx = end offset of the 64-byte loop, edx = leftover byte count
5653 
5654       // unrolled loop - use all MMX registers and interleave to reduce
5655       // number of branch instructions (loops) and reduce partial stalls
5656    "up_loop:                       \n\t"
5657       "movq (%1," PBX ",), %%mm1   \n\t"
5658       "movq (%2," PBX ",), %%mm0   \n\t"
5659       "movq 8(%1," PBX ",), %%mm3  \n\t"
5660       "paddb %%mm1, %%mm0          \n\t"
5661       "movq 8(%2," PBX ",), %%mm2  \n\t"
5662       "movq %%mm0, (%2," PBX ",)   \n\t"
5663       "paddb %%mm3, %%mm2          \n\t"
5664       "movq 16(%1," PBX ",), %%mm5 \n\t"
5665       "movq %%mm2, 8(%2," PBX ",)  \n\t"
5666       "movq 16(%2," PBX ",), %%mm4 \n\t"
5667       "movq 24(%1," PBX ",), %%mm7 \n\t"
5668       "paddb %%mm5, %%mm4          \n\t"
5669       "movq 24(%2," PBX ",), %%mm6 \n\t"
5670       "movq %%mm4, 16(%2," PBX ",) \n\t"
5671       "paddb %%mm7, %%mm6          \n\t"
5672       "movq 32(%1," PBX ",), %%mm1 \n\t"
5673       "movq %%mm6, 24(%2," PBX ",) \n\t"
5674       "movq 32(%2," PBX ",), %%mm0 \n\t"
5675       "movq 40(%1," PBX ",), %%mm3 \n\t"
5676       "paddb %%mm1, %%mm0          \n\t"
5677       "movq 40(%2," PBX ",), %%mm2 \n\t"
5678       "movq %%mm0, 32(%2," PBX ",) \n\t"
5679       "paddb %%mm3, %%mm2          \n\t"
5680       "movq 48(%1," PBX ",), %%mm5 \n\t"
5681       "movq %%mm2, 40(%2," PBX ",) \n\t"
5682       "movq 48(%2," PBX ",), %%mm4 \n\t"
5683       "movq 56(%1," PBX ",), %%mm7 \n\t"
5684       "paddb %%mm5, %%mm4          \n\t"
5685       "movq 56(%2," PBX ",), %%mm6 \n\t"
5686       "movq %%mm4, 48(%2," PBX ",) \n\t"
5687       "addl $64, %%ebx             \n\t"
5688       "paddb %%mm7, %%mm6          \n\t"
5689       "cmpl %%ecx, %%ebx           \n\t"
5690       "movq %%mm6, -8(%2," PBX ",) \n\t" // (+56)movq does not affect flags;
5691       "jb up_loop                  \n\t" //  -8 to offset addl ebx
5692 
5693       "cmpl $0, %%edx              \n\t" // test for bytes over mult of 64
5694       "jz up_end                   \n\t"
5695 
5696       "cmpl $8, %%edx              \n\t" // test for less than 8 bytes
5697       "jb up_lt8                   \n\t" //  [added by lcreeve at netins.net]
5698 
5699       "addl %%edx, %%ecx           \n\t"
5700       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
5701       "subl %%edx, %%ecx           \n\t" // drop over-bytes from length
5702       "jz up_lt8                   \n\t"
5703 
5704    "up_lpA:                        \n\t" // use MMX regs to update 8 bytes sim.
5705       "movq (%1," PBX ",), %%mm1   \n\t"
5706       "movq (%2," PBX ",), %%mm0   \n\t"
5707       "addl $8, %%ebx              \n\t"
5708       "paddb %%mm1, %%mm0          \n\t"
5709       "cmpl %%ecx, %%ebx           \n\t"
5710       "movq %%mm0, -8(%2," PBX ",) \n\t" // movq does not affect flags; -8 to
5711       "jb up_lpA                   \n\t" //  offset add ebx
5712       "cmpl $0, %%edx              \n\t" // test for bytes over mult of 8
5713       "jz up_end                   \n\t"
5714 
5715    "up_lt8:                        \n\t"
5716       "xorl %%eax, %%eax           \n\t"
5717       "addl %%edx, %%ecx           \n\t" // move over byte count into counter
5718 
5719    "up_lp2:                        \n\t" // use x86 regs for remaining bytes
5720       "movb (%2," PBX ",), %%al    \n\t"
5721       "addb (%1," PBX ",), %%al    \n\t"
5722       "incl %%ebx                  \n\t"
5723       "cmpl %%ecx, %%ebx           \n\t"
5724       "movb %%al, -1(%2," PBX ",)  \n\t" // mov does not affect flags; -1 to
5725       "jb up_lp2                   \n\t" //  offset inc ebx
5726 
5727    "up_end:                        \n\t"
5728       "EMMS                        \n\t" // conversion of filtered row complete
5729       RESTORE_GOT_ebx
5730 
5731       : "=d" (dummy_value_d),   // 0     // output regs (dummy)
5732         "=S" (dummy_value_S),   // 1
5733         "=D" (dummy_value_D)    // 2
5734 
5735       : "0" (len),              // edx   // input regs
5736         "1" (prev_row),         // esi
5737         "2" (row)               // edi
5738 
5739       : "%eax", "%ecx"                   // clobber list (no input regs!)
5740         _CLOBBER_GOT_ebx
      // NOTE(review):  this guard is spelled PNG_CLOBBER_MMX_REGS_SUPPORTED,
      // whereas png_read_filter_row_mmx_sub() tests CLOBBER_MMX_REGS_SUPPORTED
      // (no PNG_ prefix); confirm which macro is actually defined.
5741 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
5742       , "%mm0", "%mm1", "%mm2", "%mm3"
5743       , "%mm4", "%mm5", "%mm6", "%mm7"
5744 #endif
5745    );
5746 
5747 } // end of png_read_filter_row_mmx_up()
5748 
5749 #endif /* PNG_MMX_READ_FILTER_UP_SUPPORTED */
5750 
5751 
5752 
5753 
5754 /*===========================================================================*/
5755 /*                                                                           */
5756 /*                   P N G _ R E A D _ F I L T E R _ R O W                   */
5757 /*                                                                           */
5758 /*===========================================================================*/
5759 
5760 /* Optimized png_read_filter_row routines */
5761 
5762 void /* PRIVATE */
png_read_filter_row(png_structp png_ptr,png_row_infop row_info,png_bytep row,png_bytep prev_row,int filter)5763 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5764    row, png_bytep prev_row, int filter)
5765 {
5766 #if defined(PNG_DEBUG)
5767    char filtname[10];
5768 #endif
5769 
5770    if (_mmx_supported == 2) {
5771 #if !defined(PNG_1_0_X)
5772        /* this should have happened in png_init_mmx_flags() already */
5773        png_warning(png_ptr, "asm_flags may not have been initialized");
5774 #endif
5775        png_mmx_support();
5776    }
5777 
5778 #if defined(PNG_DEBUG)
5779    png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5780    switch (filter)
5781    {
5782       case 0:
5783          png_snprintf(filtname, 10, "none");
5784          break;
5785 
5786       case 1:
5787          png_snprintf(filtname, 10, "sub-%s",
5788 #ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
5789 #if !defined(PNG_1_0_X)
5790            ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5791             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5792             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5793 #else
5794            _mmx_supported
5795 #endif
5796            ? "MMX" :
5797 #endif
5798            "C");
5799          break;
5800 
5801       case 2:
5802          png_snprintf(filtname, 10, "up-%s",
5803 #ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
5804 #if !defined(PNG_1_0_X)
5805            ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5806             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5807             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5808 #else
5809            _mmx_supported
5810 #endif
5811            ? "MMX" :
5812 #endif
5813            "C");
5814          break;
5815 
5816       case 3:
5817          png_snprintf(filtname, 10, "avg-%s",
5818 #ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
5819 #if !defined(PNG_1_0_X)
5820            ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5821             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5822             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5823 #else
5824            _mmx_supported
5825 #endif
5826            ? "MMX" :
5827 #endif
5828            "C");
5829          break;
5830 
5831       case 4:
5832          png_snprintf(filtname, 10, "paeth-%s",
5833 #ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
5834 #if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
5835 #if !defined(PNG_1_0_X)
5836            ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5837             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5838             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5839 #else
5840            _mmx_supported
5841 #endif
5842            ? "MMX" :
5843 #endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
5844 #endif
5845            "C");
5846          break;
5847 
5848       default:
5849          png_snprintf(filtname, 10, "unknown");
5850          break;
5851    }
5852    png_debug2(2, "row_number=%ld, %s, ", png_ptr->row_number, filtname);
5853    //png_debug1(0, "png_ptr=%10p, ", png_ptr);
5854    //png_debug1(0, "asm_flags=0x%08lx, ", png_ptr->asm_flags);
5855    png_debug1(0, "row=%10p, ", row);
5856    png_debug2(0, "pixdepth=%d, bytes=%d, ", (int)row_info->pixel_depth,
5857       (int)((row_info->pixel_depth + 7) >> 3));
5858    png_debug1(0, "rowbytes=%ld\n", row_info->rowbytes);
5859 #endif /* PNG_DEBUG */
5860 
5861    switch (filter)
5862    {
5863       case PNG_FILTER_VALUE_NONE:
5864          break;
5865 
5866       case PNG_FILTER_VALUE_SUB:
5867 #ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
5868 #if !defined(PNG_1_0_X)
5869          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5870              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5871              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5872 #else
5873          if (_mmx_supported)
5874 #endif
5875          {
5876             png_read_filter_row_mmx_sub(row_info, row);
5877          }
5878          else
5879 #endif
5880          {
5881             png_uint_32 i;
5882             png_uint_32 istop = row_info->rowbytes;
5883             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5884             png_bytep rp = row + bpp;
5885             png_bytep lp = row;
5886 
5887             for (i = bpp; i < istop; i++)
5888             {
5889                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5890                rp++;
5891             }
5892          }  /* end !UseMMX_sub */
5893          break;
5894 
5895       case PNG_FILTER_VALUE_UP:
5896 #ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
5897 #if !defined(PNG_1_0_X)
5898          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5899              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5900              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5901 #else
5902          if (_mmx_supported)
5903 #endif
5904          {
5905             png_read_filter_row_mmx_up(row_info, row, prev_row);
5906          }
5907           else
5908 #endif
5909          {
5910             png_uint_32 i;
5911             png_uint_32 istop = row_info->rowbytes;
5912             png_bytep rp = row;
5913             png_bytep pp = prev_row;
5914 
5915             for (i = 0; i < istop; ++i)
5916             {
5917                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5918                rp++;
5919             }
5920          }  /* end !UseMMX_up */
5921          break;
5922 
5923       case PNG_FILTER_VALUE_AVG:
5924 #ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
5925 #if !defined(PNG_1_0_X)
5926          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5927              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5928              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5929 #else
5930          if (_mmx_supported)
5931 #endif
5932          {
5933             png_read_filter_row_mmx_avg(row_info, row, prev_row);
5934          }
5935          else
5936 #endif
5937          {
5938             png_uint_32 i;
5939             png_bytep rp = row;
5940             png_bytep pp = prev_row;
5941             png_bytep lp = row;
5942             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5943             png_uint_32 istop = row_info->rowbytes - bpp;
5944 
5945             for (i = 0; i < bpp; i++)
5946             {
5947                *rp = (png_byte)(((int)(*rp) +
5948                   ((int)(*pp++) >> 1)) & 0xff);
5949                rp++;
5950             }
5951 
5952             for (i = 0; i < istop; i++)
5953             {
5954                *rp = (png_byte)(((int)(*rp) +
5955                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5956                rp++;
5957             }
5958          }  /* end !UseMMX_avg */
5959          break;
5960 
5961       case PNG_FILTER_VALUE_PAETH:
5962 #ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
5963 #if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
5964 #if !defined(PNG_1_0_X)
5965          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5966              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5967              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5968 #else
5969          if (_mmx_supported)
5970 #endif
5971          {
5972             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5973          }
5974          else
5975 #endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
5976 #endif
5977          {
5978             png_uint_32 i;
5979             png_bytep rp = row;
5980             png_bytep pp = prev_row;
5981             png_bytep lp = row;
5982             png_bytep cp = prev_row;
5983             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5984             png_uint_32 istop = row_info->rowbytes - bpp;
5985 
5986             for (i = 0; i < bpp; i++)
5987             {
5988                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5989                rp++;
5990             }
5991 
5992             for (i = 0; i < istop; i++)   /* use leftover rp,pp */
5993             {
5994                int a, b, c, pa, pb, pc, p;
5995 
5996                a = *lp++;
5997                b = *pp++;
5998                c = *cp++;
5999 
6000                p = b - c;
6001                pc = a - c;
6002 
6003 #if defined(PNG_USE_ABS)
6004                pa = abs(p);
6005                pb = abs(pc);
6006                pc = abs(p + pc);
6007 #else
6008                pa = p < 0 ? -p : p;
6009                pb = pc < 0 ? -pc : pc;
6010                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
6011 #endif
6012 
6013                /*
6014                   if (pa <= pb && pa <= pc)
6015                      p = a;
6016                   else if (pb <= pc)
6017                      p = b;
6018                   else
6019                      p = c;
6020                 */
6021 
6022                p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
6023 
6024                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
6025                rp++;
6026             }
6027          }  /* end !UseMMX_paeth */
6028          break;
6029 
6030       default:
6031          png_warning(png_ptr, "Ignoring bad row-filter type");
6032          *row=0;
6033          break;
6034    }
6035 }
6036 
6037 #endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
6038 
6039 
6040 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */
6041 #endif /* __GNUC__ */
6042