;
; jidctred.asm - reduced-size IDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains inverse-DCT routines that produce reduced-size
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
; The following code is based directly on the IJG's original jidctred.c;
; see the jidctred.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Fixed-point configuration.
;
; CONST_BITS   number of fraction bits in the F_x_xxx multipliers below
;              (F_x_xxx = x * 2^CONST_BITS, rounded to nearest).
; PASS1_BITS   left shift that the 4x4 routine applies to its pass-1
;              results (the all-zero-AC shortcut does it with
;              "psllw PASS1_BITS"; the general path gets the same scale
;              from DESCALE_P1_4).
; DESCALE_Px_n right-shift count used at the end of pass x of the
;              n-by-n routine.  The matching rounding bias
;              1 << (DESCALE_Px_n - 1) is stored in the constant table.

%define CONST_BITS      13
%define PASS1_BITS      2

%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)
%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)
%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)
%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)

; The multipliers are emitted with "dw" in the constant table and consumed
; by pmaddwd, so every value must fit in a signed 16-bit word.
%if CONST_BITS == 13
; Precomputed values for the default CONST_BITS.
F_0_211 equ      1730           ; FIX(0.211164243)
F_0_509 equ      4176           ; FIX(0.509795579)
F_0_601 equ      4926           ; FIX(0.601344887)
F_0_720 equ      5906           ; FIX(0.720959822)
F_0_765 equ      6270           ; FIX(0.765366865)
F_0_850 equ      6967           ; FIX(0.850430095)
F_0_899 equ      7373           ; FIX(0.899976223)
F_1_061 equ      8697           ; FIX(1.061594337)
F_1_272 equ     10426           ; FIX(1.272758580)
F_1_451 equ     11893           ; FIX(1.451774981)
F_1_847 equ     15137           ; FIX(1.847759065)
F_2_172 equ     17799           ; FIX(2.172734803)
F_2_562 equ     20995           ; FIX(2.562915447)
F_3_624 equ     29692           ; FIX(3.624509785)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; Each literal below is the constant scaled by 2^30; DESCALE rounds it
; down to CONST_BITS fraction bits (add half an LSB, then shift right).
%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
%endif

; --------------------------------------------------------------------------
; Constant table used by both IDCT routines in this file.
;
; PW_*  one 64-bit pmaddwd operand: the word pair (c0, c1) repeated twice.
;       With source words interleaved as (a b a' b'), pmaddwd yields the
;       dwords (a*c0 + b*c1, a'*c0 + b'*c1).
;       Naming: Fnnn = +F_n_nn..., MFnnn = -F_n_nn... (M = minus).
; PD_*  two dwords holding the rounding bias 1 << (shift-1) that is added
;       before the matching "psrad shift".
; PB_CENTERJSAMP
;       eight bytes of CENTERJSAMPLE, added with paddb after the results
;       have been packed to signed bytes.
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_idct_red_mmx)

EXTN(jconst_idct_red_mmx):

PW_F184_MF076   times 2 dw  F_1_847,-F_0_765
PW_F256_F089    times 2 dw  F_2_562, F_0_899
PW_F106_MF217   times 2 dw  F_1_061,-F_2_172
PW_MF060_MF050  times 2 dw -F_0_601,-F_0_509
PW_F145_MF021   times 2 dw  F_1_451,-F_0_211
PW_F362_MF127   times 2 dw  F_3_624,-F_1_272
PW_F085_MF072   times 2 dw  F_0_850,-F_0_720
PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4-1)
PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4-1)
PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2-1)
PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2-1)
PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 4x4 output block.
;
; GLOBAL(void)
; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
;                     JSAMPARRAY output_buf, JDIMENSION output_col)
;
; In:    dct_table   multiplier table; applied element-wise (pmullw) to
;                    the coefficient rows during pass 1
;        coef_block  input coefficients; rows 0,1,2,3,5,6,7 are read,
;                    row 4 is never loaded
;        output_buf  array of row pointers; entries 0..3 are used
;        output_col  sample offset added to every row pointer
; Out:   four samples stored at output_buf[r] + output_col, r = 0..3
; Clobb: eax, ecx, edx, mm0-mm7, flags.  esi and edi are saved and
;        restored; ebx is handled by the pushpic/poppic macros.
;        emms is executed before returning.
; Stack: a private MMWORD-aligned frame (see the %defines below) holding
;        wk[WK_NUM] and a DCTSIZE2-element JCOEF workspace.
;
; Pass 1 runs DCTSIZE/4 times and handles four input columns per
; iteration (one column per word lane).  Each iteration stores four
; transposed MMWORDs, so workspace row k ends up holding the four pass-1
; results of input column k.  Pass 2 then treats workspace rows
; 1,3,5,7 / 0,2,6 as its odd / even inputs and produces all four output
; rows at once.
;
; NOTE(review): MMBLOCK/DWBLOCK come from the include files; from their
; use here MMBLOCK(r,n,base,size) appears to address the n-th MMWORD of
; row r - confirm against jsimdext.inc.
;

%define dct_table(b)    (b)+8           ; void * dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

; Frame layout relative to the aligned ebp:
;   [ebp+0]         frame base saved by the prologue (original_ebp)
;   wk(0)..wk(1)    two MMWORD spill slots directly below ebp
;   workspace       DCTSIZE2 JCOEFs directly below wk(0)
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          2
%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
                                        ; JCOEF workspace[DCTSIZE2]

        align   16
        global  EXTN(jsimd_idct_4x4_mmx)

EXTN(jsimd_idct_4x4_mmx):
        push    ebp
        mov     eax,esp                         ; eax = frame base: [eax]=saved
                                                ;  ebp, arguments at eax+8...
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax                       ; [original_ebp] = frame base
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [workspace]                ; reserve wk[] and workspace
        pushpic ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        ; ---- Pass 1: process columns from input, store into work array.

        ; eax still holds the frame base set in the prologue, so the reload
        ; below is left commented out.  This relies on pushpic, push and
        ; get_GOT leaving eax untouched.
;       mov     eax, [original_ebp]
        mov     edx, POINTER [dct_table(eax)]           ; quantptr
        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
        lea     edi, [workspace]                        ; JCOEF * wsptr
        mov     ecx, DCTSIZE/4                          ; ctr
        alignx  16,7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
        ; Cheap early-out first: if the first DWORD of row 1 or row 2 is
        ; non-zero, the full transform is needed.
        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz     short .columnDCT

        ; OR together every AC row this routine uses (1,2,3,5,6,7).
        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        por     mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        por     mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        por     mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        por     mm0,mm1
        packsswb mm0,mm0        ; saturating pack: a non-zero word stays
                                ;  non-zero, so 4 words fit in one dword
        movd    eax,mm0
        test    eax,eax
        jnz     short .columnDCT

        ; -- AC terms all zero

        ; Every output of a column equals its dequantized DC term scaled
        ; by PASS1_BITS.
        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

        psllw   mm0,PASS1_BITS

        ; Broadcast each lane across a whole MMWORD (already "transposed").
        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)

        movq      mm1,mm0
        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
        movq      mm3,mm2
        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
        jmp     near .nextcolumn
        alignx  16,7
%endif
.columnDCT:

        ; -- Odd part

        ; Load and dequantize rows 1,3,5,7.
        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        pmullw  mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
        pmullw  mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        pmullw  mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

        ; Interleave (in1,in3) so that each pmaddwd forms
        ; in1*c0 + in3*c1 as a 32-bit value; L/H = low/high two columns.
        movq      mm4,mm0
        movq      mm5,mm0
        punpcklwd mm4,mm1
        punpckhwd mm5,mm1
        movq      mm0,mm4
        movq      mm1,mm5
        pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]        ; mm4=(tmp2L)
        pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]        ; mm5=(tmp2H)
        pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]       ; mm0=(tmp0L)
        pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]       ; mm1=(tmp0H)

        ; Same for (in5,in7); parenthesised names are partial sums.
        movq      mm6,mm2
        movq      mm7,mm2
        punpcklwd mm6,mm3
        punpckhwd mm7,mm3
        movq      mm2,mm6
        movq      mm3,mm7
        pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm6=(tmp2L)
        pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm7=(tmp2H)
        pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]       ; mm2=(tmp0L)
        pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]       ; mm3=(tmp0H)

        paddd   mm6,mm4                 ; mm6=tmp2L
        paddd   mm7,mm5                 ; mm7=tmp2H
        paddd   mm2,mm0                 ; mm2=tmp0L
        paddd   mm3,mm1                 ; mm3=tmp0H

        ; All eight MMX registers are needed below, so tmp0 is spilled.
        movq    MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
        movq    MMWORD [wk(1)], mm3     ; wk(1)=tmp0H

        ; -- Even part

        ; Load and dequantize rows 0,2,6.
        movq    mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movq    mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq    mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        pmullw  mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
        pmullw  mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
        pmullw  mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

        ; tmp0 = in0 << (CONST_BITS+1): unpacking against zero puts each
        ; word in the upper half of a dword (value << 16); the arithmetic
        ; right shift then leaves the sign-extended value at the wanted
        ; scale.
        pxor      mm1,mm1
        pxor      mm2,mm2
        punpcklwd mm1,mm4               ; mm1=tmp0L
        punpckhwd mm2,mm4               ; mm2=tmp0H
        psrad     mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
        psrad     mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1

        ; tmp2 = in2 * F_1_847 - in6 * F_0_765
        movq      mm3,mm5               ; mm5=in2=z2
        punpcklwd mm5,mm0               ; mm0=in6=z3
        punpckhwd mm3,mm0
        pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]       ; mm5=tmp2L
        pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]       ; mm3=tmp2H

        movq    mm4,mm1
        movq    mm0,mm2
        paddd   mm1,mm5                 ; mm1=tmp10L
        paddd   mm2,mm3                 ; mm2=tmp10H
        psubd   mm4,mm5                 ; mm4=tmp12L
        psubd   mm0,mm3                 ; mm0=tmp12H

        ; -- Final output stage

        ; data0/data3 = tmp10 +/- tmp2 (odd-part tmp2 is in mm6/mm7)
        movq    mm5,mm1
        movq    mm3,mm2
        paddd   mm1,mm6                 ; mm1=data0L
        paddd   mm2,mm7                 ; mm2=data0H
        psubd   mm5,mm6                 ; mm5=data3L
        psubd   mm3,mm7                 ; mm3=data3H

        movq    mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]       ; mm6=[PD_DESCALE_P1_4]

        ; Round (add bias) and descale.
        paddd   mm1,mm6
        paddd   mm2,mm6
        psrad   mm1,DESCALE_P1_4
        psrad   mm2,DESCALE_P1_4
        paddd   mm5,mm6
        paddd   mm3,mm6
        psrad   mm5,DESCALE_P1_4
        psrad   mm3,DESCALE_P1_4

        packssdw  mm1,mm2               ; mm1=data0=(00 01 02 03)
        packssdw  mm5,mm3               ; mm5=data3=(30 31 32 33)

        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp0L
        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp0H

        ; data1/data2 = tmp12 +/- tmp0
        movq    mm2,mm4
        movq    mm3,mm0
        paddd   mm4,mm7                 ; mm4=data1L
        paddd   mm0,mm6                 ; mm0=data1H
        psubd   mm2,mm7                 ; mm2=data2L
        psubd   mm3,mm6                 ; mm3=data2H

        ; The bias has to be reloaded: mm6 was reused for tmp0H above.
        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]       ; mm7=[PD_DESCALE_P1_4]

        paddd   mm4,mm7
        paddd   mm0,mm7
        psrad   mm4,DESCALE_P1_4
        psrad   mm0,DESCALE_P1_4
        paddd   mm2,mm7
        paddd   mm3,mm7
        psrad   mm2,DESCALE_P1_4
        psrad   mm3,DESCALE_P1_4

        packssdw  mm4,mm0               ; mm4=data1=(10 11 12 13)
        packssdw  mm2,mm3               ; mm2=data2=(20 21 22 23)

        ; 4x4 word transpose so that each stored MMWORD holds the four
        ; results of one input column.
        movq      mm6,mm1               ; transpose coefficients(phase 1)
        punpcklwd mm1,mm4               ; mm1=(00 10 01 11)
        punpckhwd mm6,mm4               ; mm6=(02 12 03 13)
        movq      mm7,mm2               ; transpose coefficients(phase 1)
        punpcklwd mm2,mm5               ; mm2=(20 30 21 31)
        punpckhwd mm7,mm5               ; mm7=(22 32 23 33)

        movq      mm0,mm1               ; transpose coefficients(phase 2)
        punpckldq mm1,mm2               ; mm1=(00 10 20 30)
        punpckhdq mm0,mm2               ; mm0=(01 11 21 31)
        movq      mm3,mm6               ; transpose coefficients(phase 2)
        punpckldq mm6,mm7               ; mm6=(02 12 22 32)
        punpckhdq mm3,mm7               ; mm3=(03 13 23 33)

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3

.nextcolumn:
        ; Advance to the next four input columns / next four workspace rows.
        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
        add     edx, byte 4*SIZEOF_ISLOW_MULT_TYPE      ; quantptr
        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
        dec     ecx                                     ; ctr
        jnz     near .columnloop

        ; ---- Pass 2: process rows from work array, store into output array.

        ; eax was used as scratch by the zero test, so the frame base is
        ; reloaded before fetching the remaining arguments.
        mov     eax, [original_ebp]
        lea     esi, [workspace]                        ; JCOEF * wsptr
        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(eax)]

        ; -- Odd part

        ; Same arithmetic as pass 1, but the inputs are workspace rows and
        ; no dequantization is applied.
        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

        movq      mm4,mm0
        movq      mm5,mm0
        punpcklwd mm4,mm1
        punpckhwd mm5,mm1
        movq      mm0,mm4
        movq      mm1,mm5
        pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]        ; mm4=(tmp2L)
        pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]        ; mm5=(tmp2H)
        pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]       ; mm0=(tmp0L)
        pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]       ; mm1=(tmp0H)

        movq      mm6,mm2
        movq      mm7,mm2
        punpcklwd mm6,mm3
        punpckhwd mm7,mm3
        movq      mm2,mm6
        movq      mm3,mm7
        pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm6=(tmp2L)
        pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm7=(tmp2H)
        pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]       ; mm2=(tmp0L)
        pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]       ; mm3=(tmp0H)

        paddd   mm6,mm4                 ; mm6=tmp2L
        paddd   mm7,mm5                 ; mm7=tmp2H
        paddd   mm2,mm0                 ; mm2=tmp0L
        paddd   mm3,mm1                 ; mm3=tmp0H

        movq    MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
        movq    MMWORD [wk(1)], mm3     ; wk(1)=tmp0H

        ; -- Even part

        movq    mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movq    mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq    mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

        pxor      mm1,mm1
        pxor      mm2,mm2
        punpcklwd mm1,mm4               ; mm1=tmp0L
        punpckhwd mm2,mm4               ; mm2=tmp0H
        psrad     mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
        psrad     mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1

        movq      mm3,mm5               ; mm5=in2=z2
        punpcklwd mm5,mm0               ; mm0=in6=z3
        punpckhwd mm3,mm0
        pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]       ; mm5=tmp2L
        pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]       ; mm3=tmp2H

        movq    mm4,mm1
        movq    mm0,mm2
        paddd   mm1,mm5                 ; mm1=tmp10L
        paddd   mm2,mm3                 ; mm2=tmp10H
        psubd   mm4,mm5                 ; mm4=tmp12L
        psubd   mm0,mm3                 ; mm0=tmp12H

        ; -- Final output stage

        movq    mm5,mm1
        movq    mm3,mm2
        paddd   mm1,mm6                 ; mm1=data0L
        paddd   mm2,mm7                 ; mm2=data0H
        psubd   mm5,mm6                 ; mm5=data3L
        psubd   mm3,mm7                 ; mm3=data3H

        movq    mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]       ; mm6=[PD_DESCALE_P2_4]

        paddd   mm1,mm6
        paddd   mm2,mm6
        psrad   mm1,DESCALE_P2_4
        psrad   mm2,DESCALE_P2_4
        paddd   mm5,mm6
        paddd   mm3,mm6
        psrad   mm5,DESCALE_P2_4
        psrad   mm3,DESCALE_P2_4

        packssdw  mm1,mm2               ; mm1=data0=(00 10 20 30)
        packssdw  mm5,mm3               ; mm5=data3=(03 13 23 33)

        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp0L
        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp0H

        movq    mm2,mm4
        movq    mm3,mm0
        paddd   mm4,mm7                 ; mm4=data1L
        paddd   mm0,mm6                 ; mm0=data1H
        psubd   mm2,mm7                 ; mm2=data2L
        psubd   mm3,mm6                 ; mm3=data2H

        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]       ; mm7=[PD_DESCALE_P2_4]

        paddd   mm4,mm7
        paddd   mm0,mm7
        psrad   mm4,DESCALE_P2_4
        psrad   mm0,DESCALE_P2_4
        paddd   mm2,mm7
        paddd   mm3,mm7
        psrad   mm2,DESCALE_P2_4
        psrad   mm3,DESCALE_P2_4

        packssdw  mm4,mm0               ; mm4=data1=(01 11 21 31)
        packssdw  mm2,mm3               ; mm2=data2=(02 12 22 32)

        movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm6=[PB_CENTERJSAMP]

        ; Pack to bytes with signed saturation, then add CENTERJSAMPLE to
        ; every byte.
        packsswb  mm1,mm2               ; mm1=(00 10 20 30 02 12 22 32)
        packsswb  mm4,mm5               ; mm4=(01 11 21 31 03 13 23 33)
        paddb     mm1,mm6
        paddb     mm4,mm6

        ; Byte transpose so that each output row is 4 contiguous bytes.
        movq      mm7,mm1               ; transpose coefficients(phase 1)
        punpcklbw mm1,mm4               ; mm1=(00 01 10 11 20 21 30 31)
        punpckhbw mm7,mm4               ; mm7=(02 03 12 13 22 23 32 33)

        movq      mm0,mm1               ; transpose coefficients(phase 2)
        punpcklwd mm1,mm7               ; mm1=(00 01 02 03 10 11 12 13)
        punpckhwd mm0,mm7               ; mm0=(20 21 22 23 30 31 32 33)

        ; Rows 0 and 2 live in the low halves of mm1/mm0 ...
        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
        movd    DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
        movd    DWORD [esi+eax*SIZEOF_JSAMPLE], mm0

        ; ... rows 1 and 3 in the high halves.
        psrlq   mm1,4*BYTE_BIT
        psrlq   mm0,4*BYTE_BIT

        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
        movd    DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
        movd    DWORD [esi+eax*SIZEOF_JSAMPLE], mm0

        emms            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        poppic  ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- frame base saved at [original_ebp]
        pop     ebp
        ret


; --------------------------------------------------------------------------
;
; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 2x2 output block.
;
; GLOBAL(void)
; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
;                     JSAMPARRAY output_buf, JDIMENSION output_col)
;
; In:    dct_table   multiplier table; applied element-wise (pmullw) to
;                    the coefficient rows that are loaded
;        coef_block  input coefficients; only rows 0,1,3,5,7 are loaded and
;                    only columns 0,1,3,5,7 contribute to the result
;        output_buf  array of row pointers; entries 0 and 1 are used
;        output_col  sample offset added to both row pointers
; Out:   two samples stored at output_buf[r] + output_col, r = 0..1
; Clobb: eax, ecx, edx, mm0-mm7, flags.  ebx, esi and edi are saved and
;        restored.  emms is executed before returning.
; Stack: plain ebp frame, no locals; everything stays in registers.
;
; ebx is pushed unconditionally (not with pushpic) because, after its last
; use as the GOT base, it is reused as a data register for the output.
;

%define dct_table(b)    (b)+8           ; void * dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

        align   16
        global  EXTN(jsimd_idct_2x2_mmx)

EXTN(jsimd_idct_2x2_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        ; ---- Pass 1: process columns from input.

        mov     edx, POINTER [dct_table(ebp)]           ; quantptr
        mov     esi, JCOEFPTR [coef_block(ebp)]         ; inptr

        ; "**" marks coefficients that are ignored; lanes holding them are
        ; carried along and dropped later.
        ; | input:                  | result:        |
        ; | 00 01 ** 03 ** 05 ** 07 |                |
        ; | 10 11 ** 13 ** 15 ** 17 |                |
        ; | ** ** ** ** ** ** ** ** |                |
        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
        ; | 50 51 ** 53 ** 55 ** 57 |                |
        ; | ** ** ** ** ** ** ** ** |                |
        ; | 70 71 ** 73 ** 75 ** 77 |                |

        ; -- Odd part

        ; Load and dequantize the left halves (columns 0..3) of rows 1,3,5,7.
        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        pmullw  mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
        pmullw  mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        pmullw  mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

        ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
        ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)

        ; Build a mask selecting the odd-numbered word of each dword.
        pcmpeqd   mm7,mm7
        pslld     mm7,WORD_BIT          ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}

        ; Column 0: pair rows (1,3) and (5,7) for pmaddwd.  Only the
        ; column-0 dword of the result is used later.
        movq      mm4,mm0               ; mm4=(10 11 ** 13)
        movq      mm5,mm2               ; mm5=(50 51 ** 53)
        punpcklwd mm4,mm1               ; mm4=(10 30 11 31)
        punpcklwd mm5,mm3               ; mm5=(50 70 51 71)
        pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
        pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]

        ; Columns 1 and 3: shift/mask the odd words into (rowA rowB) pairs
        ; without an unpack.
        psrld   mm0,WORD_BIT            ; mm0=(11 -- 13 --)
        pand    mm1,mm7                 ; mm1=(-- 31 -- 33)
        psrld   mm2,WORD_BIT            ; mm2=(51 -- 53 --)
        pand    mm3,mm7                 ; mm3=(-- 71 -- 73)
        por     mm0,mm1                 ; mm0=(11 31 13 33)
        por     mm2,mm3                 ; mm2=(51 71 53 73)
        pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)]
        pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)]

        paddd   mm4,mm5                 ; mm4=tmp0[col0 col1]

        ; Right halves (columns 4..7) of rows 1,3,5,7.
        movq    mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
        movq    mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
        pmullw  mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
        pmullw  mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
        movq    mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
        movq    mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
        pmullw  mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
        pmullw  mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]

        ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
        ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)

        psrld   mm6,WORD_BIT            ; mm6=(15 -- 17 --)
        pand    mm1,mm7                 ; mm1=(-- 35 -- 37)
        psrld   mm3,WORD_BIT            ; mm3=(55 -- 57 --)
        pand    mm5,mm7                 ; mm5=(-- 75 -- 77)
        por     mm6,mm1                 ; mm6=(15 35 17 37)
        por     mm3,mm5                 ; mm3=(55 75 57 77)
        pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)]
        pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)]

        paddd   mm0,mm2                 ; mm0=tmp0[col1 col3]
        paddd   mm6,mm3                 ; mm6=tmp0[col5 col7]

        ; -- Even part

        ; Only row 0 contributes: tmp10 = in0 << (CONST_BITS+2).
        movq    mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movq    mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
        pmullw  mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
        pmullw  mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]

        ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)

        ; Each wanted word is moved to (or already sits in) the upper half
        ; of its dword, i.e. value << WORD_BIT; the arithmetic right shift
        ; then leaves the sign-extended value << (CONST_BITS+2).
        movq    mm2,mm1                         ; mm2=(00 01 ** 03)
        pslld   mm1,WORD_BIT                    ; mm1=(-- 00 -- **)
        psrad   mm1,(WORD_BIT-CONST_BITS-2)     ; mm1=tmp10[col0 ****]

        pand    mm2,mm7                         ; mm2=(-- 01 -- 03)
        pand    mm5,mm7                         ; mm5=(-- 05 -- 07)
        psrad   mm2,(WORD_BIT-CONST_BITS-2)     ; mm2=tmp10[col1 col3]
        psrad   mm5,(WORD_BIT-CONST_BITS-2)     ; mm5=tmp10[col5 col7]

        ; -- Final output stage

        ; data0 = tmp10 + tmp0 (row A), data1 = tmp10 - tmp0 (row B).
        ; For column 0 only the low dword is meaningful; punpckldq keeps it.
        movq      mm3,mm1
        paddd     mm1,mm4               ; mm1=data0[col0 ****]=(A0 **)
        psubd     mm3,mm4               ; mm3=data1[col0 ****]=(B0 **)
        punpckldq mm1,mm3               ; mm1=(A0 B0)

        ; mm7 (the word mask) is no longer needed and is reused for the bias.
        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]       ; mm7=[PD_DESCALE_P1_2]

        movq    mm4,mm2
        movq    mm3,mm5
        paddd   mm2,mm0                 ; mm2=data0[col1 col3]=(A1 A3)
        paddd   mm5,mm6                 ; mm5=data0[col5 col7]=(A5 A7)
        psubd   mm4,mm0                 ; mm4=data1[col1 col3]=(B1 B3)
        psubd   mm3,mm6                 ; mm3=data1[col5 col7]=(B5 B7)

        ; Round and descale all pass-1 results.
        paddd   mm1,mm7
        psrad   mm1,DESCALE_P1_2

        paddd   mm2,mm7
        paddd   mm5,mm7
        psrad   mm2,DESCALE_P1_2
        psrad   mm5,DESCALE_P1_2
        paddd   mm4,mm7
        paddd   mm3,mm7
        psrad   mm4,DESCALE_P1_2
        psrad   mm3,DESCALE_P1_2

        ; ---- Pass 2: process rows, store into output array.

        mov     edi, JSAMPARRAY [output_buf(ebp)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(ebp)]

        ; | input:| result:|
        ; | A0 B0 |        |
        ; | A1 B1 | C0 C1  |
        ; | A3 B3 | D0 D1  |
        ; | A5 B5 |        |
        ; | A7 B7 |        |

        ; -- Odd part

        ; The packed words are already in (x1 x3)/(x5 x7) pair order, so
        ; pmaddwd produces one value per row (A in the low dword, B in the
        ; high dword).
        packssdw  mm2,mm4               ; mm2=(A1 A3 B1 B3)
        packssdw  mm5,mm3               ; mm5=(A5 A7 B5 B7)
        pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
        pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]

        paddd     mm2,mm5               ; mm2=tmp0[row0 row1]

        ; -- Even part

        pslld     mm1,(CONST_BITS+2)    ; mm1=tmp10[row0 row1]

        ; -- Final output stage

        movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]     ; mm0=[PD_DESCALE_P2_2]

        movq      mm6,mm1
        paddd     mm1,mm2               ; mm1=data0[row0 row1]=(C0 C1)
        psubd     mm6,mm2               ; mm6=data1[row0 row1]=(D0 D1)

        paddd     mm1,mm0
        paddd     mm6,mm0
        psrad     mm1,DESCALE_P2_2
        psrad     mm6,DESCALE_P2_2

        movq      mm7,mm1               ; transpose coefficients
        punpckldq mm1,mm6               ; mm1=(C0 D0)
        punpckhdq mm7,mm6               ; mm7=(C1 D1)

        ; Narrow to bytes with signed saturation, then add CENTERJSAMPLE.
        packssdw  mm1,mm7               ; mm1=(C0 D0 C1 D1)
        packsswb  mm1,mm1               ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
        paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]

        ; Last GOTOFF use was just above; ebx is free to hold data now.
        movd    ecx,mm1
        movd    ebx,mm1                 ; ebx=(C0 D0 C1 D1)
        shr     ecx,2*BYTE_BIT          ; ecx=(C1 D1 -- --)

        ; bx = (C0 D0) goes to output row 0, cx = (C1 D1) to output row 1.
        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        mov     WORD [edx+eax*SIZEOF_JSAMPLE], bx
        mov     WORD [esi+eax*SIZEOF_JSAMPLE], cx

        emms            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
