• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jdcolext.asm - colorspace conversion (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on the x86 SIMD extension for IJG JPEG library
7; Copyright (C) 1999-2006, MIYASAKA Masaru.
8; For conditions of distribution and use, see copyright notice in jsimdext.inc
9;
10; This file should be assembled with NASM (Netwide Assembler); it
11; can *not* be assembled with Microsoft's MASM or any compatible
12; assembler (including Borland's Turbo Assembler).
13; NASM is available from http://nasm.sourceforge.net/ or
14; http://sourceforge.net/project/showfiles.php?group_id=6208
15;
16; [TAB8]
17
18%include "jcolsamp.inc"
19
20; --------------------------------------------------------------------------
21;
22; Convert some rows of samples to the output colorspace.
23;
24; GLOBAL(void)
25; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
26;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
27;                            JSAMPARRAY output_buf, int num_rows)
28;
29
; Stack-frame layout for jsimd_ycc_rgb_convert_mmx.
;
; The argument macros take a base register b that points at the saved-ebp
; slot (the function passes eax, which it loads from esp right after
; "push ebp"), so the first argument is addressed as b+8.
; NOTE(review): the +8 presumably skips the saved ebp and the return
; address of a 32-bit stack-argument call -- confirm against the ABI in use.
30%define out_width(b)    (b)+8           ; JDIMENSION out_width
31%define input_buf(b)    (b)+12          ; JSAMPIMAGE input_buf
32%define input_row(b)    (b)+16          ; JDIMENSION input_row
33%define output_buf(b)   (b)+20          ; JSAMPARRAY output_buf
34%define num_rows(b)     (b)+24          ; int num_rows
35
; Locals are addressed relative to the realigned ebp set up in the prologue:
;   [original_ebp]  slot where the prologue stores the pre-alignment frame base
;   wk(i)           i-th of WK_NUM mmword scratch slots directly below ebp
;                   (wk(WK_NUM) would be ebp itself)
;   gotptr          one pointer-sized slot directly below wk(0)
; WK_NUM is defined after wk(i) uses it; that works because %define bodies
; are expanded when the macro is invoked, not when it is defined.
36%define original_ebp    ebp+0           ; holds the frame base saved by the prologue
37%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
38%define WK_NUM          2               ; number of mmword scratch slots
39%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
40
; --------------------------------------------------------------------------
; jsimd_ycc_rgb_convert_mmx
;
; For each of num_rows rows, reads one row pointer from each of the three
; arrays input_buf[0], input_buf[1], input_buf[2] (starting at entry
; input_row) and one row pointer from output_buf, then converts out_width
; Y/Cb/Cr samples into pixels of RGB_PIXELSIZE bytes.
;
; Register roles once setup is done:
;   ecx           = out_width; inside .columnloop, columns left in this row
;   esi, ebx, edx = cursors into the row-pointer arrays of input_buf[0..2]
;                   (inside .columnloop: the Y, Cb, Cr sample pointers)
;   edi           = cursor into output_buf (inside .columnloop: outptr)
;   eax           = remaining-row counter in .rowloop;
;                   GOT address inside .columnloop
;
; ebx, esi, edi and ebp are saved and restored; eax, ecx and edx are not.
; Returns immediately when out_width == 0 or num_rows <= 0.
;
; Every .columnloop pass loads a full mmword from each input row before the
; remaining-column count is checked, so rows are read in whole mmwords even
; when fewer columns remain.
; NOTE(review): presumably the callers' row buffers are padded to allow
; this -- confirm.
; --------------------------------------------------------------------------
41        align   16
42        global  EXTN(jsimd_ycc_rgb_convert_mmx)
43
44EXTN(jsimd_ycc_rgb_convert_mmx):
45        push    ebp
46        mov     eax,esp                         ; eax -> saved ebp (base for the argument macros)
47        sub     esp, byte 4                     ; leave room for the slot written two lines below
48        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
49        mov     [esp],eax                       ; [original_ebp] = pre-alignment frame base
50        mov     ebp,esp                         ; ebp = aligned ebp
51        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM] below ebp
52        pushpic eax             ; make room for the GOT address
53        push    ebx
54;       push    ecx             ; need not be preserved
55;       push    edx             ; need not be preserved
56        push    esi
57        push    edi
58
59        get_GOT ebx                     ; get GOT address
60        movpic  POINTER [gotptr], ebx   ; save GOT address
61
62        mov     ecx, JDIMENSION [out_width(eax)]        ; num_cols
63        test    ecx,ecx
64        jz      near .return            ; zero width: nothing to convert
65
66        push    ecx                     ; park num_cols; ecx is reused for input_row
67
68        mov     edi, JSAMPIMAGE [input_buf(eax)]
69        mov     ecx, JDIMENSION [input_row(eax)]
70        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]       ; esi = input_buf[0]
71        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]       ; ebx = input_buf[1]
72        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]       ; edx = input_buf[2]
73        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = &input_buf[0][input_row]
74        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; ebx = &input_buf[1][input_row]
75        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]  ; edx = &input_buf[2][input_row]
76
77        pop     ecx                     ; ecx = num_cols again
78
79        mov     edi, JSAMPARRAY [output_buf(eax)]
80        mov     eax, INT [num_rows(eax)]        ; eax stops being the argument base here
81        test    eax,eax
82        jle     near .return            ; signed test: zero or negative row count
83        alignx  16,7
84.rowloop:
        ; Save the per-row state; .columnloop overwrites all six registers.
        ; .nextrow pops them in the reverse order.
85        push    eax
86        push    edi
87        push    edx
88        push    ebx
89        push    esi
90        push    ecx                     ; col
91
92        mov     esi, JSAMPROW [esi]     ; inptr0
93        mov     ebx, JSAMPROW [ebx]     ; inptr1
94        mov     edx, JSAMPROW [edx]     ; inptr2
95        mov     edi, JSAMPROW [edi]     ; outptr
96        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
97        alignx  16,7
98.columnloop:
99
100        movq    mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
101        movq    mm1, MMWORD [edx]       ; mm1=Cr(01234567)
102
        ; Build the two constants in registers: all-ones words shifted right
        ; give the low-byte mask, shifted left by 7 give 0xFF80 per word.
103        pcmpeqw mm4,mm4
104        pcmpeqw mm7,mm7
105        psrlw   mm4,BYTE_BIT
106        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
107        movq    mm0,mm4                 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
108
        ; Split the packed bytes into even- and odd-indexed samples, each
        ; widened to a 16-bit word.
109        pand    mm4,mm5                 ; mm4=Cb(0246)=CbE
110        psrlw   mm5,BYTE_BIT            ; mm5=Cb(1357)=CbO
111        pand    mm0,mm1                 ; mm0=Cr(0246)=CrE
112        psrlw   mm1,BYTE_BIT            ; mm1=Cr(1357)=CrO
113
114        paddw   mm4,mm7                 ; CbE += 0xFF80 (i.e. -128 as a signed word)
115        paddw   mm5,mm7                 ; CbO += 0xFF80
116        paddw   mm0,mm7                 ; CrE += 0xFF80
117        paddw   mm1,mm7                 ; CrO += 0xFF80
118
119        ; (Original)
120        ; R = Y                + 1.40200 * Cr
121        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
122        ; B = Y + 1.77200 * Cb
123        ;
124        ; (This implementation)
125        ; R = Y                + 0.40200 * Cr + Cr
126        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
127        ; B = Y - 0.22800 * Cb + Cb + Cb
128
129        movq    mm2,mm4                 ; mm2=CbE
130        movq    mm3,mm5                 ; mm3=CbO
131        paddw   mm4,mm4                 ; mm4=2*CbE
132        paddw   mm5,mm5                 ; mm5=2*CbO
133        movq    mm6,mm0                 ; mm6=CrE
134        movq    mm7,mm1                 ; mm7=CrO
135        paddw   mm0,mm0                 ; mm0=2*CrE
136        paddw   mm1,mm1                 ; mm1=2*CrO
137
        ; pmulhw keeps only the high word of each product.  The inputs were
        ; doubled above and the results are halved again below, after adding
        ; PW_ONE.
        ; NOTE(review): PW_ONE is defined outside this chunk; presumably four
        ; words of 1, which makes the halving round to nearest -- confirm.
138        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbE * -FIX(0.22800))
139        pmulhw  mm5,[GOTOFF(eax,PW_MF0228)]     ; mm5=(2*CbO * -FIX(0.22800))
140        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrE * FIX(0.40200))
141        pmulhw  mm1,[GOTOFF(eax,PW_F0402)]      ; mm1=(2*CrO * FIX(0.40200))
142
143        paddw   mm4,[GOTOFF(eax,PW_ONE)]
144        paddw   mm5,[GOTOFF(eax,PW_ONE)]
145        psraw   mm4,1                   ; mm4=(CbE * -FIX(0.22800))
146        psraw   mm5,1                   ; mm5=(CbO * -FIX(0.22800))
147        paddw   mm0,[GOTOFF(eax,PW_ONE)]
148        paddw   mm1,[GOTOFF(eax,PW_ONE)]
149        psraw   mm0,1                   ; mm0=(CrE * FIX(0.40200))
150        psraw   mm1,1                   ; mm1=(CrO * FIX(0.40200))
151
152        paddw   mm4,mm2
153        paddw   mm5,mm3
154        paddw   mm4,mm2                 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
155        paddw   mm5,mm3                 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
156        paddw   mm0,mm6                 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
157        paddw   mm1,mm7                 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
158
        ; Spill (B-Y) to the stack scratch slots: mm4/mm5 are needed as
        ; temporaries for the G computation and the Y load below.
159        movq    MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
160        movq    MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O
161
        ; Interleave Cb and Cr words so that each pmaddwd dword lane is
        ; Cb*c0 + Cr*c1 for one sample (low/high halves handled separately).
162        movq      mm4,mm2
163        movq      mm5,mm3
164        punpcklwd mm2,mm6
165        punpckhwd mm4,mm6
166        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
167        pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
168        punpcklwd mm3,mm7
169        punpckhwd mm5,mm7
170        pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
171        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
172
        ; Add PD_ONEHALF, then shift the 32-bit sums right by SCALEBITS.
173        paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
174        paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
175        psrad     mm2,SCALEBITS
176        psrad     mm4,SCALEBITS
177        paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
178        paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
179        psrad     mm3,SCALEBITS
180        psrad     mm5,SCALEBITS
181
182        packssdw  mm2,mm4       ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
183        packssdw  mm3,mm5       ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
184        psubw     mm2,mm6       ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
185        psubw     mm3,mm7       ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
186
187        movq      mm5, MMWORD [esi]     ; mm5=Y(01234567)
188
189        pcmpeqw   mm4,mm4
190        psrlw     mm4,BYTE_BIT          ; mm4={0xFF 0x00 0xFF 0x00 ..}
191        pand      mm4,mm5               ; mm4=Y(0246)=YE
192        psrlw     mm5,BYTE_BIT          ; mm5=Y(1357)=YO
193
        ; Add Y to each colour difference; packuswb then saturates every
        ; signed word to an unsigned byte.
194        paddw     mm0,mm4               ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
195        paddw     mm1,mm5               ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
196        packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
197        packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)
198
199        paddw     mm2,mm4               ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
200        paddw     mm3,mm5               ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
201        packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
202        packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)
203
204        paddw     mm4, MMWORD [wk(0)]   ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
205        paddw     mm5, MMWORD [wk(1)]   ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
206        packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
207        packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)
208
        ; mmA..mmH used below are register aliases defined outside this chunk.
        ; NOTE(review): presumably they map the R/G/B registers above onto
        ; the configured output byte order -- confirm in the include file.
        ; In the layout comments "cp" means byte c of pixel p.
209%if RGB_PIXELSIZE == 3 ; ---------------
210
211        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
212        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
213        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
214        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
215
216        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
217        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
218        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)
219
220        movq      mmG,mmA
221        movq      mmH,mmA
222        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
223        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)
224
225        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
226        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)
227
228        movq      mmC,mmD
229        movq      mmB,mmD
230        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
231        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)
232
233        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)
234
235        movq      mmF,mmE
236        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
237        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)
238
239        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
240        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
241        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)
242
243        cmp     ecx, byte SIZEOF_MMWORD
244        jb      short .column_st16      ; fewer than a full group of columns left
245
        ; Full group: three mmwords of packed 3-byte pixels.
246        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
247        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
248        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
249
250        sub     ecx, byte SIZEOF_MMWORD
251        jz      short .nextrow          ; row finished exactly on a group boundary
252
253        add     esi, byte SIZEOF_MMWORD                 ; inptr0
254        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
255        add     edx, byte SIZEOF_MMWORD                 ; inptr2
256        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
257        jmp     near .columnloop
258        alignx  16,7
259
        ; Partial final group.  ecx is converted from columns to bytes, and
        ; the remainder is written in progressively smaller pieces so that
        ; exactly ecx bytes are stored.
260.column_st16:
261        lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
262        cmp     ecx, byte 2*SIZEOF_MMWORD
263        jb      short .column_st8
264        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
265        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
266        movq    mmA,mmC                 ; remaining bytes now live in mmA
267        sub     ecx, byte 2*SIZEOF_MMWORD
268        add     edi, byte 2*SIZEOF_MMWORD
        ; Fewer than 3*SIZEOF_MMWORD bytes were pending, so less than one
        ; mmword is left and the .column_st8 step can be skipped.
269        jmp     short .column_st4
270.column_st8:
271        cmp     ecx, byte SIZEOF_MMWORD
272        jb      short .column_st4
273        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
274        movq    mmA,mmE                 ; remaining bytes now live in mmA
275        sub     ecx, byte SIZEOF_MMWORD
276        add     edi, byte SIZEOF_MMWORD
277.column_st4:
        ; eax (the GOT address) may be overwritten from here on: the row is
        ; finished, .nextrow pops eax, and .rowloop reloads the GOT address.
278        movd    eax,mmA
279        cmp     ecx, byte SIZEOF_DWORD
280        jb      short .column_st2
281        mov     DWORD [edi+0*SIZEOF_DWORD], eax
282        psrlq   mmA,DWORD_BIT           ; bring the upper dword down
283        movd    eax,mmA
284        sub     ecx, byte SIZEOF_DWORD
285        add     edi, byte SIZEOF_DWORD
286.column_st2:
287        cmp     ecx, byte SIZEOF_WORD
288        jb      short .column_st1
289        mov     WORD [edi+0*SIZEOF_WORD], ax
290        shr     eax,WORD_BIT            ; bring the upper word down
291        sub     ecx, byte SIZEOF_WORD
292        add     edi, byte SIZEOF_WORD
293.column_st1:
294        cmp     ecx, byte SIZEOF_BYTE
295        jb      short .nextrow
296        mov     BYTE [edi+0*SIZEOF_BYTE], al
297
298%else ; RGB_PIXELSIZE == 4 ; -----------
299
        ; The fourth byte of every pixel is a filler: all ones when
        ; RGBX_FILLER_0XFF is defined, zero otherwise.
300%ifdef RGBX_FILLER_0XFF
301        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
302        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
303%else
304        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
305        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
306%endif
307        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
308        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
309        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
310        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
311
312        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
313        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
314        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
315        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)
316
317        movq      mmC,mmA
318        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
319        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
320        movq      mmG,mmB
321        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
322        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)
323
324        movq      mmD,mmA
325        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
326        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
327        movq      mmH,mmC
328        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
329        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)
330
331        cmp     ecx, byte SIZEOF_MMWORD
332        jb      short .column_st16      ; fewer than a full group of columns left
333
        ; Full group: four mmwords of packed 4-byte pixels.
334        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
335        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
336        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
337        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH
338
339        sub     ecx, byte SIZEOF_MMWORD
340        jz      short .nextrow          ; row finished exactly on a group boundary
341
342        add     esi, byte SIZEOF_MMWORD                 ; inptr0
343        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
344        add     edx, byte SIZEOF_MMWORD                 ; inptr2
345        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
346        jmp     near .columnloop
347        alignx  16,7
348
        ; Partial final group.  ecx stays in columns here; at 4 bytes per
        ; pixel, SIZEOF_MMWORD/2 columns fill two mmwords, SIZEOF_MMWORD/4
        ; columns fill one mmword and SIZEOF_MMWORD/8 columns fill one dword.
349.column_st16:
350        cmp     ecx, byte SIZEOF_MMWORD/2
351        jb      short .column_st8
352        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
353        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
354        movq    mmA,mmC                 ; shift the remaining pixels down
355        movq    mmD,mmH
356        sub     ecx, byte SIZEOF_MMWORD/2
357        add     edi, byte 2*SIZEOF_MMWORD
358.column_st8:
359        cmp     ecx, byte SIZEOF_MMWORD/4
360        jb      short .column_st4
361        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
362        movq    mmA,mmD                 ; shift the remaining pixels down
363        sub     ecx, byte SIZEOF_MMWORD/4
364        add     edi, byte 1*SIZEOF_MMWORD
365.column_st4:
366        cmp     ecx, byte SIZEOF_MMWORD/8
367        jb      short .nextrow
368        movd    DWORD [edi+0*SIZEOF_DWORD], mmA
369
370%endif ; RGB_PIXELSIZE ; ---------------
371
372        alignx  16,7
373
374.nextrow:
        ; Restore the per-row state in the reverse of the .rowloop push order.
375        pop     ecx
376        pop     esi
377        pop     ebx
378        pop     edx
379        pop     edi
380        pop     eax
381
        ; Step every row-pointer cursor to its next entry.
382        add     esi, byte SIZEOF_JSAMPROW
383        add     ebx, byte SIZEOF_JSAMPROW
384        add     edx, byte SIZEOF_JSAMPROW
385        add     edi, byte SIZEOF_JSAMPROW       ; output_buf
386        dec     eax                             ; num_rows
387        jg      near .rowloop                   ; loop while rows remain (eax > 0)
388
        ; The early exits jump straight to .return and skip emms; no MMX
        ; instruction has executed on those paths.
389        emms            ; empty MMX state
390
391.return:
392        pop     edi
393        pop     esi
394;       pop     edx             ; need not be preserved
395;       pop     ecx             ; need not be preserved
396        pop     ebx
397        mov     esp,ebp         ; esp <- aligned ebp
398        pop     esp             ; esp <- original ebp
399        pop     ebp
400        ret
401
402; For some reason, the OS X linker does not honor the request to align the
403; segment unless we do this.
404        align   16
405