• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jidctred.asm - reduced-size IDCT (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on the x86 SIMD extension for IJG JPEG library
7; Copyright (C) 1999-2006, MIYASAKA Masaru.
8; For conditions of distribution and use, see copyright notice in jsimdext.inc
9;
10; This file must be assembled with NASM (Netwide Assembler); it
11; can *not* be assembled with Microsoft's MASM or any compatible
12; assembler (including Borland's Turbo Assembler).
13; NASM is available from http://nasm.sourceforge.net/ or
14; http://sourceforge.net/project/showfiles.php?group_id=6208
15;
16; This file contains inverse-DCT routines that produce reduced-size
17; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
18; The following code is based directly on the IJG's original jidctred.c;
19; see the jidctred.c for more details.
20;
21; [TAB8]
22
23%include "jsimdext.inc"
24%include "jdct.inc"
25
26; --------------------------------------------------------------------------
27
; Fixed-point precision parameters.
;   CONST_BITS - number of fractional bits in the F_x_xxx multipliers below
;                (each one is its real value scaled by 2^CONST_BITS).
;   PASS1_BITS - extra fractional bits kept in the pass-1 (column) results;
;                the DC-only shortcut of the 4x4 routine shifts left by it.
28%define CONST_BITS      13
29%define PASS1_BITS      2
30
; Right-shift counts applied (after adding the matching PD_DESCALE_*
; rounding constant) to bring 32-bit sums back into range:
;   P1 = pass 1 (columns), P2 = pass 2 (rows);
;   _4 = 4x4 routine,      _2 = 2x2 routine.
31%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)
32%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)
33%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)
34%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)
35
; Multiplier table.  With the default CONST_BITS the values are written out
; literally; the trailing FIX(...) comment gives the real number each
; integer approximates.
36%if CONST_BITS == 13
37F_0_211 equ      1730           ; FIX(0.211164243)
38F_0_509 equ      4176           ; FIX(0.509795579)
39F_0_601 equ      4926           ; FIX(0.601344887)
40F_0_720 equ      5906           ; FIX(0.720959822)
41F_0_765 equ      6270           ; FIX(0.765366865)
42F_0_850 equ      6967           ; FIX(0.850430095)
43F_0_899 equ      7373           ; FIX(0.899976223)
44F_1_061 equ      8697           ; FIX(1.061594337)
45F_1_272 equ     10426           ; FIX(1.272758580)
46F_1_451 equ     11893           ; FIX(1.451774981)
47F_1_847 equ     15137           ; FIX(1.847759065)
48F_2_172 equ     17799           ; FIX(2.172734803)
49F_2_562 equ     20995           ; FIX(2.562915447)
50F_3_624 equ     29692           ; FIX(3.624509785)
51%else
52; NASM cannot do compile-time arithmetic on floating-point constants.
; Instead each constant is given pre-scaled by 2^30 and reduced to
; CONST_BITS fractional bits with a rounding right shift:
; DESCALE(x,n) adds half of 2^n before shifting right by n.
53%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
54F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
55F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
56F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
57F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
58F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
59F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
60F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
61F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
62F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
63F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
64F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
65F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
66F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
67F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
68%endif
69
70; --------------------------------------------------------------------------
; Read-only constant table shared by both IDCT routines in this file.
; The code addresses every entry as GOTOFF(ebx,<label>).
71        SECTION SEG_CONST
72
73        alignz  16
74        global  EXTN(jconst_idct_red_mmx)
75
76EXTN(jconst_idct_red_mmx):
77
; PW_* : one 8-byte operand for pmaddwd, holding the 16-bit multiplier pair
;        (a, b) twice.  In the label, "F" marks a positive and "MF" a
;        negated multiplier; the digits abbreviate the F_x_xxx name.
78PW_F184_MF076   times 2 dw  F_1_847,-F_0_765
79PW_F256_F089    times 2 dw  F_2_562, F_0_899
80PW_F106_MF217   times 2 dw  F_1_061,-F_2_172
81PW_MF060_MF050  times 2 dw -F_0_601,-F_0_509
82PW_F145_MF021   times 2 dw  F_1_451,-F_0_211
83PW_F362_MF127   times 2 dw  F_3_624,-F_1_272
84PW_F085_MF072   times 2 dw  F_0_850,-F_0_720
; PD_DESCALE_* : two dwords of the rounding term 1 << (n-1), added with
;        paddd just before the psrad by the matching DESCALE_* count n.
85PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4-1)
86PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4-1)
87PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2-1)
88PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2-1)
; PB_CENTERJSAMP : eight bytes of CENTERJSAMPLE, added with paddb to the
;        packed signed results to form the output samples.
89PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
90
91        alignz  16
92
93; --------------------------------------------------------------------------
94        SECTION SEG_TEXT
95        BITS    32
96;
97; Perform dequantization and inverse DCT on one block of coefficients,
98; producing a reduced-size 4x4 output block.
99;
100; GLOBAL(void)
101; jsimd_idct_4x4_mmx (void *dct_table, JCOEFPTR coef_block,
102;                     JSAMPARRAY output_buf, JDIMENSION output_col)
103;
; The four arguments are read from the stack at +8..+20 relative to the
; frame pointer captured at entry (see the defines below).  Nothing is
; returned in a register: four rows of four samples are stored through
; output_buf[0..3], each at sample index output_col.
;
; Register roles:
;   ebx      base register for GOTOFF() constant addressing (from get_GOT)
;   pass 1   edx = quantptr, esi = inptr, edi = wsptr, ecx = loop counter
;   pass 2   esi = wsptr (later a row pointer), edi = output_buf,
;            eax = output_col, edx = row pointer
; ebx, esi and edi are saved and restored; eax, ecx, edx and mm0-mm7 are
; overwritten.  emms is executed before returning.
;
; MMBLOCK(4,...) of the input is never read by this routine.
104
105%define dct_table(b)    (b)+8           ; void *dct_table
106%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
107%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
108%define output_col(b)   (b)+20          ; JDIMENSION output_col
109
; Locals, all addressed from the 8-byte-aligned ebp set up in the prologue:
;   [original_ebp]  holds the unaligned frame pointer (esp right after
;                   "push ebp"), used to reach the arguments and to unwind
;   wk(0), wk(1)    two MMWORD spill slots immediately below ebp
;   workspace       DCTSIZE2 JCOEFs immediately below wk(0); holds the
;                   pass-1 results consumed by pass 2
110%define original_ebp    ebp+0
111%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
112%define WK_NUM          2
113%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
114                                        ; JCOEF workspace[DCTSIZE2]
115
116        align   16
117        global  EXTN(jsimd_idct_4x4_mmx)
118
119EXTN(jsimd_idct_4x4_mmx):
120        push    ebp
121        mov     eax,esp                         ; eax -> saved ebp (unaligned frame pointer)
122        sub     esp, byte 4
123        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
124        mov     [esp],eax                       ; stash unaligned frame pointer at [original_ebp]
125        mov     ebp,esp                         ; ebp = aligned ebp
126        lea     esp, [workspace]                ; reserve wk[] and workspace; pushes go below them
127        pushpic ebx
128;       push    ecx             ; need not be preserved
129;       push    edx             ; need not be preserved
130        push    esi
131        push    edi
132
133        get_GOT ebx             ; get GOT address
134
135        ; ---- Pass 1: process columns from input, store into work array.
        ; Each iteration handles four columns (one MMWORD of every input
        ; row); ecx counts DCTSIZE/4 iterations.
136
        ; eax is used below as it was set in the prologue, which is why the
        ; reload stays commented out.  NOTE(review): this assumes pushpic
        ; and get_GOT leave eax untouched - confirm in jsimdext.inc.
137;       mov     eax, [original_ebp]
138        mov     edx, POINTER [dct_table(eax)]           ; quantptr
139        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
140        lea     edi, [workspace]                        ; JCOEF *wsptr
141        mov     ecx, DCTSIZE/4                          ; ctr
142        alignx  16,7
143.columnloop:
144%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
        ; Quick reject: if the first dword of block 1 or block 2 is
        ; non-zero there is AC energy, so skip the full test.
145        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
146        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
147        jnz     short .columnDCT
148
        ; Full test: OR together blocks 1,2,3,5,6,7 (block 4 is not used by
        ; this routine).  packsswb saturates, so any non-zero word remains a
        ; non-zero byte and the 32-bit test below covers all four columns.
149        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
150        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
151        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
152        por     mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
153        por     mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
154        por     mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
155        por     mm0,mm1
156        packsswb mm0,mm0
157        movd    eax,mm0
158        test    eax,eax
159        jnz     short .columnDCT
160
161        ; -- AC terms all zero
        ; Only the dequantized DC term matters: scale it by PASS1_BITS and
        ; replicate each column's value across a whole output MMWORD.
162
163        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
164        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
165
166        psllw   mm0,PASS1_BITS
167
168        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
169        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
170        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
171
172        movq      mm1,mm0
173        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
174        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
175        movq      mm3,mm2
176        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
177        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
178
179        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
180        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
181        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
182        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
183        jmp     near .nextcolumn
184        alignx  16,7
185%endif
186.columnDCT:
187
188        ; -- Odd part
        ; Dequantize blocks 1,3,5,7, interleave them pairwise and let
        ; pmaddwd form the two-term products.  Results: mm6/mm7 = tmp2
        ; (low/high column pair), wk(0)/wk(1) = tmp0 (low/high).
189
190        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
191        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
192        pmullw  mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
193        pmullw  mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
194        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
195        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
196        pmullw  mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
197        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
198
199        movq      mm4,mm0
200        movq      mm5,mm0
201        punpcklwd mm4,mm1
202        punpckhwd mm5,mm1
203        movq      mm0,mm4
204        movq      mm1,mm5
205        pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]        ; mm4=(tmp2L)
206        pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]        ; mm5=(tmp2H)
207        pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]       ; mm0=(tmp0L)
208        pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]       ; mm1=(tmp0H)
209
210        movq      mm6,mm2
211        movq      mm7,mm2
212        punpcklwd mm6,mm3
213        punpckhwd mm7,mm3
214        movq      mm2,mm6
215        movq      mm3,mm7
216        pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm6=(tmp2L)
217        pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm7=(tmp2H)
218        pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]       ; mm2=(tmp0L)
219        pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]       ; mm3=(tmp0H)
220
221        paddd   mm6,mm4                 ; mm6=tmp2L
222        paddd   mm7,mm5                 ; mm7=tmp2H
223        paddd   mm2,mm0                 ; mm2=tmp0L
224        paddd   mm3,mm1                 ; mm3=tmp0H
225
        ; Spill tmp0: all eight mm registers are needed for the even part.
226        movq    MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
227        movq    MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
228
229        ; -- Even part
230
231        movq    mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
232        movq    mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
233        movq    mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
234        pmullw  mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
235        pmullw  mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
236        pmullw  mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
237
        ; Unpacking in0 into the high half of each zeroed dword and then
        ; shifting right arithmetically by (16-CONST_BITS-1) yields the
        ; sign-extended value scaled by 2^(CONST_BITS+1) in one shift.
238        pxor      mm1,mm1
239        pxor      mm2,mm2
240        punpcklwd mm1,mm4               ; mm1=tmp0L
241        punpckhwd mm2,mm4               ; mm2=tmp0H
242        psrad     mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
243        psrad     mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
244
245        movq      mm3,mm5               ; mm5=in2=z2
246        punpcklwd mm5,mm0               ; mm0=in6=z3
247        punpckhwd mm3,mm0
248        pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]       ; mm5=tmp2L
249        pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]       ; mm3=tmp2H
250
251        movq    mm4,mm1
252        movq    mm0,mm2
253        paddd   mm1,mm5                 ; mm1=tmp10L
254        paddd   mm2,mm3                 ; mm2=tmp10H
255        psubd   mm4,mm5                 ; mm4=tmp12L
256        psubd   mm0,mm3                 ; mm0=tmp12H
257
258        ; -- Final output stage
        ; data0/data3 = tmp10 +/- odd tmp2, data1/data2 = tmp12 +/- odd tmp0;
        ; each is rounded, shifted by DESCALE_P1_4 and packed to 16 bits.
259
260        movq    mm5,mm1
261        movq    mm3,mm2
262        paddd   mm1,mm6                 ; mm1=data0L
263        paddd   mm2,mm7                 ; mm2=data0H
264        psubd   mm5,mm6                 ; mm5=data3L
265        psubd   mm3,mm7                 ; mm3=data3H
266
267        movq    mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]       ; mm6=[PD_DESCALE_P1_4]
268
269        paddd   mm1,mm6
270        paddd   mm2,mm6
271        psrad   mm1,DESCALE_P1_4
272        psrad   mm2,DESCALE_P1_4
273        paddd   mm5,mm6
274        paddd   mm3,mm6
275        psrad   mm5,DESCALE_P1_4
276        psrad   mm3,DESCALE_P1_4
277
278        packssdw  mm1,mm2               ; mm1=data0=(00 01 02 03)
279        packssdw  mm5,mm3               ; mm5=data3=(30 31 32 33)
280
281        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp0L
282        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp0H
283
284        movq    mm2,mm4
285        movq    mm3,mm0
286        paddd   mm4,mm7                 ; mm4=data1L
287        paddd   mm0,mm6                 ; mm0=data1H
288        psubd   mm2,mm7                 ; mm2=data2L
289        psubd   mm3,mm6                 ; mm3=data2H
290
291        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]       ; mm7=[PD_DESCALE_P1_4]
292
293        paddd   mm4,mm7
294        paddd   mm0,mm7
295        psrad   mm4,DESCALE_P1_4
296        psrad   mm0,DESCALE_P1_4
297        paddd   mm2,mm7
298        paddd   mm3,mm7
299        psrad   mm2,DESCALE_P1_4
300        psrad   mm3,DESCALE_P1_4
301
302        packssdw  mm4,mm0               ; mm4=data1=(10 11 12 13)
303        packssdw  mm2,mm3               ; mm2=data2=(20 21 22 23)
304
        ; Transpose the 4x4 word tile so that each stored MMWORD holds one
        ; input column's four results.
305        movq      mm6,mm1               ; transpose coefficients(phase 1)
306        punpcklwd mm1,mm4               ; mm1=(00 10 01 11)
307        punpckhwd mm6,mm4               ; mm6=(02 12 03 13)
308        movq      mm7,mm2               ; transpose coefficients(phase 1)
309        punpcklwd mm2,mm5               ; mm2=(20 30 21 31)
310        punpckhwd mm7,mm5               ; mm7=(22 32 23 33)
311
312        movq      mm0,mm1               ; transpose coefficients(phase 2)
313        punpckldq mm1,mm2               ; mm1=(00 10 20 30)
314        punpckhdq mm0,mm2               ; mm0=(01 11 21 31)
315        movq      mm3,mm6               ; transpose coefficients(phase 2)
316        punpckldq mm6,mm7               ; mm6=(02 12 22 32)
317        punpckhdq mm3,mm7               ; mm3=(03 13 23 33)
318
319        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
320        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
321        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
322        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
323
324.nextcolumn:
        ; Advance the input and quantization pointers by four columns and
        ; the workspace pointer by four DCTSIZE-wide rows.
325        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
326        add     edx, byte 4*SIZEOF_ISLOW_MULT_TYPE      ; quantptr
327        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
328        dec     ecx                                     ; ctr
329        jnz     near .columnloop
330
331        ; ---- Pass 2: process rows from work array, store into output array.
        ; Same arithmetic as pass 1, minus dequantization, applied once to
        ; the transposed workspace; workspace block 4 is not read.
332
333        mov     eax, [original_ebp]
334        lea     esi, [workspace]                        ; JCOEF *wsptr
335        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
336        mov     eax, JDIMENSION [output_col(eax)]       ; eax is the column index from here on
337
338        ; -- Odd part
339
340        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
341        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
342        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
343        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
344
345        movq      mm4,mm0
346        movq      mm5,mm0
347        punpcklwd mm4,mm1
348        punpckhwd mm5,mm1
349        movq      mm0,mm4
350        movq      mm1,mm5
351        pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]        ; mm4=(tmp2L)
352        pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]        ; mm5=(tmp2H)
353        pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]       ; mm0=(tmp0L)
354        pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]       ; mm1=(tmp0H)
355
356        movq      mm6,mm2
357        movq      mm7,mm2
358        punpcklwd mm6,mm3
359        punpckhwd mm7,mm3
360        movq      mm2,mm6
361        movq      mm3,mm7
362        pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm6=(tmp2L)
363        pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm7=(tmp2H)
364        pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]       ; mm2=(tmp0L)
365        pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]       ; mm3=(tmp0H)
366
367        paddd   mm6,mm4                 ; mm6=tmp2L
368        paddd   mm7,mm5                 ; mm7=tmp2H
369        paddd   mm2,mm0                 ; mm2=tmp0L
370        paddd   mm3,mm1                 ; mm3=tmp0H
371
372        movq    MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
373        movq    MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
374
375        ; -- Even part
376
377        movq    mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
378        movq    mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
379        movq    mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
380
381        pxor      mm1,mm1
382        pxor      mm2,mm2
383        punpcklwd mm1,mm4               ; mm1=tmp0L
384        punpckhwd mm2,mm4               ; mm2=tmp0H
385        psrad     mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
386        psrad     mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
387
388        movq      mm3,mm5               ; mm5=in2=z2
389        punpcklwd mm5,mm0               ; mm0=in6=z3
390        punpckhwd mm3,mm0
391        pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]       ; mm5=tmp2L
392        pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]       ; mm3=tmp2H
393
394        movq    mm4,mm1
395        movq    mm0,mm2
396        paddd   mm1,mm5                 ; mm1=tmp10L
397        paddd   mm2,mm3                 ; mm2=tmp10H
398        psubd   mm4,mm5                 ; mm4=tmp12L
399        psubd   mm0,mm3                 ; mm0=tmp12H
400
401        ; -- Final output stage
402
403        movq    mm5,mm1
404        movq    mm3,mm2
405        paddd   mm1,mm6                 ; mm1=data0L
406        paddd   mm2,mm7                 ; mm2=data0H
407        psubd   mm5,mm6                 ; mm5=data3L
408        psubd   mm3,mm7                 ; mm3=data3H
409
410        movq    mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]       ; mm6=[PD_DESCALE_P2_4]
411
412        paddd   mm1,mm6
413        paddd   mm2,mm6
414        psrad   mm1,DESCALE_P2_4
415        psrad   mm2,DESCALE_P2_4
416        paddd   mm5,mm6
417        paddd   mm3,mm6
418        psrad   mm5,DESCALE_P2_4
419        psrad   mm3,DESCALE_P2_4
420
421        packssdw  mm1,mm2               ; mm1=data0=(00 10 20 30)
422        packssdw  mm5,mm3               ; mm5=data3=(03 13 23 33)
423
424        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp0L
425        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp0H
426
427        movq    mm2,mm4
428        movq    mm3,mm0
429        paddd   mm4,mm7                 ; mm4=data1L
430        paddd   mm0,mm6                 ; mm0=data1H
431        psubd   mm2,mm7                 ; mm2=data2L
432        psubd   mm3,mm6                 ; mm3=data2H
433
434        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]       ; mm7=[PD_DESCALE_P2_4]
435
436        paddd   mm4,mm7
437        paddd   mm0,mm7
438        psrad   mm4,DESCALE_P2_4
439        psrad   mm0,DESCALE_P2_4
440        paddd   mm2,mm7
441        paddd   mm3,mm7
442        psrad   mm2,DESCALE_P2_4
443        psrad   mm3,DESCALE_P2_4
444
445        packssdw  mm4,mm0               ; mm4=data1=(01 11 21 31)
446        packssdw  mm2,mm3               ; mm2=data2=(02 12 22 32)
447
        ; Narrow to signed bytes with saturation, then add CENTERJSAMPLE to
        ; every byte to obtain the output samples.
448        movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm6=[PB_CENTERJSAMP]
449
450        packsswb  mm1,mm2               ; mm1=(00 10 20 30 02 12 22 32)
451        packsswb  mm4,mm5               ; mm4=(01 11 21 31 03 13 23 33)
452        paddb     mm1,mm6
453        paddb     mm4,mm6
454
455        movq      mm7,mm1               ; transpose coefficients(phase 1)
456        punpcklbw mm1,mm4               ; mm1=(00 01 10 11 20 21 30 31)
457        punpckhbw mm7,mm4               ; mm7=(02 03 12 13 22 23 32 33)
458
459        movq      mm0,mm1               ; transpose coefficients(phase 2)
460        punpcklwd mm1,mm7               ; mm1=(00 01 02 03 10 11 12 13)
461        punpckhwd mm0,mm7               ; mm0=(20 21 22 23 30 31 32 33)
462
        ; mm1 holds output rows 0 and 1, mm0 rows 2 and 3 (four bytes each).
        ; Store the low dwords to rows 0/2, shift, then store to rows 1/3.
463        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
464        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
465        movd    DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
466        movd    DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
467
468        psrlq   mm1,4*BYTE_BIT
469        psrlq   mm0,4*BYTE_BIT
470
471        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
472        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
473        movd    DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
474        movd    DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
475
476        emms            ; empty MMX state
477
478        pop     edi
479        pop     esi
480;       pop     edx             ; need not be preserved
481;       pop     ecx             ; need not be preserved
482        poppic  ebx
483        mov     esp,ebp         ; esp <- aligned ebp
484        pop     esp             ; esp <- original ebp
485        pop     ebp
486        ret
487
488
489; --------------------------------------------------------------------------
490;
491; Perform dequantization and inverse DCT on one block of coefficients,
492; producing a reduced-size 2x2 output block.
493;
494; GLOBAL(void)
495; jsimd_idct_2x2_mmx (void *dct_table, JCOEFPTR coef_block,
496;                     JSAMPARRAY output_buf, JDIMENSION output_col)
497;
; The four arguments are read at ebp+8..ebp+20 of a conventional ebp frame.
; Both passes run entirely in MMX registers; no stack workspace is used.
; Only input blocks 0,1,3,5,7 are loaded, and within them only columns
; 0,1,3,5,7 contribute (see the diagram in pass 1).  Two rows of two
; samples are stored through output_buf[0..1] at sample index output_col.
;
; Register roles:
;   ebx   base register for GOTOFF() constant addressing; reused as scratch
;         for the final store once the last constant has been read
;   edx   quantptr in pass 1, output row pointer at the end
;   esi   inptr in pass 1, output row pointer at the end
;   edi   output_buf, eax = output_col, ecx = scratch for the final store
; ebx, esi and edi are saved and restored; eax, ecx, edx and mm0-mm7 are
; overwritten.  emms is executed before returning.
498
499%define dct_table(b)    (b)+8           ; void *dct_table
500%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
501%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
502%define output_col(b)   (b)+20          ; JDIMENSION output_col
503
504        align   16
505        global  EXTN(jsimd_idct_2x2_mmx)
506
507EXTN(jsimd_idct_2x2_mmx):
508        push    ebp
509        mov     ebp,esp
510        push    ebx
511;       push    ecx             ; need not be preserved
512;       push    edx             ; need not be preserved
513        push    esi
514        push    edi
515
516        get_GOT ebx             ; get GOT address
517
518        ; ---- Pass 1: process columns from input.
519
520        mov     edx, POINTER [dct_table(ebp)]           ; quantptr
521        mov     esi, JCOEFPTR [coef_block(ebp)]         ; inptr
522
        ; "**" marks coefficients that are loaded but never used.
523        ; | input:                  | result:        |
524        ; | 00 01 ** 03 ** 05 ** 07 |                |
525        ; | 10 11 ** 13 ** 15 ** 17 |                |
526        ; | ** ** ** ** ** ** ** ** |                |
527        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
528        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
529        ; | 50 51 ** 53 ** 55 ** 57 |                |
530        ; | ** ** ** ** ** ** ** ** |                |
531        ; | 70 71 ** 73 ** 75 ** 77 |                |
532
533        ; -- Odd part
534
535        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
536        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
537        pmullw  mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
538        pmullw  mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
539        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
540        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
541        pmullw  mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
542        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
543
544        ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
545        ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
546
        ; Build a mask that keeps the upper word of every dword; it stays
        ; live in mm7 until the end of the even part.
547        pcmpeqd   mm7,mm7
548        pslld     mm7,WORD_BIT          ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
549
        ; Columns 0 and 1: the low two words of each block, interleaved.
550        movq      mm4,mm0               ; mm4=(10 11 ** 13)
551        movq      mm5,mm2               ; mm5=(50 51 ** 53)
552        punpcklwd mm4,mm1               ; mm4=(10 30 11 31)
553        punpcklwd mm5,mm3               ; mm5=(50 70 51 71)
554        pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
555        pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
556
        ; Columns 1 and 3: shift the odd words down in one block and mask
        ; them in place in the other, then merge into pmaddwd operand pairs.
557        psrld   mm0,WORD_BIT            ; mm0=(11 -- 13 --)
558        pand    mm1,mm7                 ; mm1=(-- 31 -- 33)
559        psrld   mm2,WORD_BIT            ; mm2=(51 -- 53 --)
560        pand    mm3,mm7                 ; mm3=(-- 71 -- 73)
561        por     mm0,mm1                 ; mm0=(11 31 13 33)
562        por     mm2,mm3                 ; mm2=(51 71 53 73)
563        pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)]
564        pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)]
565
566        paddd   mm4,mm5                 ; mm4=tmp0[col0 col1]
567
        ; Columns 5 and 7 come from the second MMWORD of each block.
568        movq    mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
569        movq    mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
570        pmullw  mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
571        pmullw  mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
572        movq    mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
573        movq    mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
574        pmullw  mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
575        pmullw  mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
576
577        ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
578        ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
579
580        psrld   mm6,WORD_BIT            ; mm6=(15 -- 17 --)
581        pand    mm1,mm7                 ; mm1=(-- 35 -- 37)
582        psrld   mm3,WORD_BIT            ; mm3=(55 -- 57 --)
583        pand    mm5,mm7                 ; mm5=(-- 75 -- 77)
584        por     mm6,mm1                 ; mm6=(15 35 17 37)
585        por     mm3,mm5                 ; mm3=(55 75 57 77)
586        pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)]
587        pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)]
588
589        paddd   mm0,mm2                 ; mm0=tmp0[col1 col3]
590        paddd   mm6,mm3                 ; mm6=tmp0[col5 col7]
591
592        ; -- Even part
        ; Only the dequantized block 0 is needed.  Each wanted word is
        ; placed in the upper half of a dword and shifted right
        ; arithmetically, giving the sign-extended value scaled by
        ; 2^(CONST_BITS+2).
593
594        movq    mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
595        movq    mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
596        pmullw  mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
597        pmullw  mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
598
599        ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
600
601        movq    mm2,mm1                         ; mm2=(00 01 ** 03)
602        pslld   mm1,WORD_BIT                    ; mm1=(-- 00 -- **)
603        psrad   mm1,(WORD_BIT-CONST_BITS-2)     ; mm1=tmp10[col0 ****]
604
605        pand    mm2,mm7                         ; mm2=(-- 01 -- 03)
606        pand    mm5,mm7                         ; mm5=(-- 05 -- 07)
607        psrad   mm2,(WORD_BIT-CONST_BITS-2)     ; mm2=tmp10[col1 col3]
608        psrad   mm5,(WORD_BIT-CONST_BITS-2)     ; mm5=tmp10[col5 col7]
609
610        ; -- Final output stage
        ; A = tmp10 + tmp0 and B = tmp10 - tmp0 for each kept column, then
        ; round and shift right by DESCALE_P1_2.
611
612        movq      mm3,mm1
613        paddd     mm1,mm4               ; mm1=data0[col0 ****]=(A0 **)
614        psubd     mm3,mm4               ; mm3=data1[col0 ****]=(B0 **)
615        punpckldq mm1,mm3               ; mm1=(A0 B0)
616
        ; The word mask is no longer needed, so mm7 is reused here.
617        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]       ; mm7=[PD_DESCALE_P1_2]
618
619        movq    mm4,mm2
620        movq    mm3,mm5
621        paddd   mm2,mm0                 ; mm2=data0[col1 col3]=(A1 A3)
622        paddd   mm5,mm6                 ; mm5=data0[col5 col7]=(A5 A7)
623        psubd   mm4,mm0                 ; mm4=data1[col1 col3]=(B1 B3)
624        psubd   mm3,mm6                 ; mm3=data1[col5 col7]=(B5 B7)
625
626        paddd   mm1,mm7
627        psrad   mm1,DESCALE_P1_2
628
629        paddd   mm2,mm7
630        paddd   mm5,mm7
631        psrad   mm2,DESCALE_P1_2
632        psrad   mm5,DESCALE_P1_2
633        paddd   mm4,mm7
634        paddd   mm3,mm7
635        psrad   mm4,DESCALE_P1_2
636        psrad   mm3,DESCALE_P1_2
637
638        ; ---- Pass 2: process rows, store into output array.
639
640        mov     edi, JSAMPARRAY [output_buf(ebp)]       ; (JSAMPROW *)
641        mov     eax, JDIMENSION [output_col(ebp)]
642
643        ; | input:| result:|
644        ; | A0 B0 |        |
645        ; | A1 B1 | C0 C1  |
646        ; | A3 B3 | D0 D1  |
647        ; | A5 B5 |        |
648        ; | A7 B7 |        |
649
650        ; -- Odd part
        ; Packing the A and B dwords to words lines them up as the same
        ; kind of operand pairs pmaddwd consumed in pass 1.
651
652        packssdw  mm2,mm4               ; mm2=(A1 A3 B1 B3)
653        packssdw  mm5,mm3               ; mm5=(A5 A7 B5 B7)
654        pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
655        pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
656
657        paddd     mm2,mm5               ; mm2=tmp0[row0 row1]
658
659        ; -- Even part
660
661        pslld     mm1,(CONST_BITS+2)    ; mm1=tmp10[row0 row1]
662
663        ; -- Final output stage
664
665        movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]     ; mm0=[PD_DESCALE_P2_2]
666
667        movq      mm6,mm1
668        paddd     mm1,mm2               ; mm1=data0[row0 row1]=(C0 C1)
669        psubd     mm6,mm2               ; mm6=data1[row0 row1]=(D0 D1)
670
671        paddd     mm1,mm0
672        paddd     mm6,mm0
673        psrad     mm1,DESCALE_P2_2
674        psrad     mm6,DESCALE_P2_2
675
676        movq      mm7,mm1               ; transpose coefficients
677        punpckldq mm1,mm6               ; mm1=(C0 D0)
678        punpckhdq mm7,mm6               ; mm7=(C1 D1)
679
        ; Narrow dwords -> words -> bytes with signed saturation, then add
        ; CENTERJSAMPLE to every byte to obtain the output samples.
680        packssdw  mm1,mm7               ; mm1=(C0 D0 C1 D1)
681        packsswb  mm1,mm1               ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
682        paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
683
        ; That was the last GOTOFF() access, so ebx can now be used as a
        ; scratch register; the epilogue restores the caller's value.
684        movd    ecx,mm1
685        movd    ebx,mm1                 ; ebx=(C0 D0 C1 D1)
686        shr     ecx,2*BYTE_BIT          ; ecx=(C1 D1 -- --)
687
        ; Two bytes (bx, then cx) go to each of the two output rows.
688        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
689        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
690        mov     WORD [edx+eax*SIZEOF_JSAMPLE], bx
691        mov     WORD [esi+eax*SIZEOF_JSAMPLE], cx
692
693        emms            ; empty MMX state
694
695        pop     edi
696        pop     esi
697;       pop     edx             ; need not be preserved
698;       pop     ecx             ; need not be preserved
699        pop     ebx
700        pop     ebp
701        ret
702
703; For some reason, the OS X linker does not honor the request to align the
704; segment unless we do this.
; The directive pads the end of the code emitted above to a 16-byte boundary.
705        align   16
706